From 6018bf9178d164a1dbcfda25b6f22ef797e01fd6 Mon Sep 17 00:00:00 2001
From: Raymond Douglass
Date: Thu, 15 Jul 2021 17:02:52 -0400
Subject: [PATCH 001/171] DOC v21.10 Updates

---
 CHANGELOG.md       | 4 ++++
 cpp/CMakeLists.txt | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c690a5059c..33d23198e5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# raft 21.10.00 (Date TBD)
+
+Please see https://github.com/rapidsai/raft/releases/tag/v21.10.00a for the latest changes to this development branch.
+
 # raft 21.08.00 (Date TBD)
 
 Please see https://github.com/rapidsai/raft/releases/tag/v21.08.00a for the latest changes to this development branch.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 1d60071d7e..eba8d75826 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -30,7 +30,7 @@ include(rapids-find)
 
 rapids_cuda_init_architectures(RAFT)
 
-project(RAFT VERSION 21.08.00 LANGUAGES CXX CUDA)
+project(RAFT VERSION 21.10.00 LANGUAGES CXX CUDA)
 
 ##############################################################################
 # - build type ---------------------------------------------------------------

From 300fcd7186ebc413354abeeb4215e4417137b733 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Mon, 26 Jul 2021 16:46:08 -0400
Subject: [PATCH 002/171] Use the new RAPIDS.cmake to fetch rapids-cmake (#298)

The original approach of using FetchContent naively has a subtle bug when multiple projects that use rapids-cmake are combined as sibling projects. This bug causes any `include(rapids-*)` commands to fail, causing CMake errors. But using `RAPIDS.cmake` we can resolve this issue and remove the now-complex logic from each consumer.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/raft/pull/298
---
 cpp/CMakeLists.txt | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index eba8d75826..04eaf548ce 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -15,13 +15,9 @@
 #=============================================================================
 cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR)
 
-include(FetchContent)
-FetchContent_Declare(
-  rapids-cmake
-  GIT_REPOSITORY https://github.com/rapidsai/rapids-cmake.git
-  GIT_TAG origin/branch-21.08
-  )
-FetchContent_MakeAvailable(rapids-cmake)
+file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake
+  ${CMAKE_BINARY_DIR}/RAPIDS.cmake)
+include(${CMAKE_BINARY_DIR}/RAPIDS.cmake)
 include(rapids-cmake)
 include(rapids-cpm)
 include(rapids-cuda)

From 78eca24a0a1824ddcd896f08dcf820911245dcf6 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Wed, 28 Jul 2021 12:36:16 -0500
Subject: [PATCH 003/171] Remove max version pin for dask & distributed on development branch (#303)

This PR will remove max version pinning for dask & distributed for development purposes.
ref: rapidsai/cudf#8881 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/jakirkham URL: https://github.com/rapidsai/raft/pull/303 --- ci/gpu/build.sh | 4 ++-- ci/local/old-gpubuild.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 8f7bb5efd9..1cb3a0b6f7 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -59,8 +59,8 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid # Install the master version of dask, distributed, and dask-ml gpuci_logger "Install the master version of dask and distributed" set -x -pip install "git+https://github.com/dask/distributed.git@2021.07.1" --upgrade --no-deps -pip install "git+https://github.com/dask/dask.git@2021.07.1" --upgrade --no-deps +pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps +pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps set +x diff --git a/ci/local/old-gpubuild.sh b/ci/local/old-gpubuild.sh index 657c354603..eb2667149d 100644 --- a/ci/local/old-gpubuild.sh +++ b/ci/local/old-gpubuild.sh @@ -81,8 +81,8 @@ fi # Install the master version of dask, distributed, and dask-ml set -x -pip install "git+https://github.com/dask/distributed.git@2021.07.1" --upgrade --no-deps -pip install "git+https://github.com/dask/dask.git@2021.07.1" --upgrade --no-deps +pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps +pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps set +x From fc1e701742a4173ae5c6ac973b5097d0550be594 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 29 Jul 2021 04:14:38 +1000 Subject: [PATCH 004/171] Warnings are errors (#299) This PR fixes current RAFT C++/CUDA compilation warnings and turns on -Wall to treat warnings as errors. Fixes #225 Fixes #289 Authors: - Mark Harris (https://github.com/harrism) Approvers: - Brad Rees (https://github.com/BradReesWork) - Corey J. 
Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/299 --- cpp/cmake/modules/ConfigureCUDA.cmake | 4 +-- .../raft/distance/pairwise_distance_base.cuh | 14 ++++---- cpp/include/raft/lap/lap_functions.cuh | 14 ++++---- cpp/include/raft/lap/lap_kernels.cuh | 11 +++--- cpp/include/raft/mr/allocator.hpp | 6 ++-- cpp/include/raft/mr/buffer_base.hpp | 5 ++- cpp/include/raft/mr/device/allocator.hpp | 6 ++-- cpp/include/raft/mr/host/allocator.hpp | 7 ++-- .../sparse/hierarchy/detail/agglomerative.cuh | 21 ++++++----- cpp/include/raft/sparse/op/sort.h | 15 +++----- cpp/test/eigen_solvers.cu | 14 ++++---- cpp/test/mst.cu | 35 ++++++++++--------- cpp/test/sparse/linkage.cu | 12 ++++--- cpp/test/spatial/knn.cu | 18 ++++++---- 14 files changed, 97 insertions(+), 85 deletions(-) diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index e92632a238..29203e86be 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -26,8 +26,8 @@ endif() list(APPEND RAFT_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) # set warnings as errors -# list(APPEND RAFT_CUDA_FLAGS -Werror=cross-execution-space-call) -# list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) +list(APPEND RAFT_CUDA_FLAGS -Werror=cross-execution-space-call) +list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) # Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking if(CUDA_ENABLE_LINEINFO) diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh index 43abc9eb65..e3ff9a7081 100644 --- a/cpp/include/raft/distance/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/pairwise_distance_base.cuh @@ -20,6 +20,8 @@ #include #include +#include + namespace raft { namespace distance { @@ -312,20 +314,20 @@ __global__ __launch_bounds__( } template -dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) { +dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) { const auto numSMs = raft::getMultiProcessorCount(); int numBlocksPerSm = 0; dim3 grid; CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &numBlocksPerSm, func, P::Nthreads, sMemSize)); - int minGridSize = numSMs * numBlocksPerSm; - int yChunks = raft::ceildiv(m, P::Mblk); - int xChunks = raft::ceildiv(n, P::Nblk); + std::size_t minGridSize = numSMs * numBlocksPerSm; + std::size_t yChunks = raft::ceildiv(m, P::Mblk); + std::size_t xChunks = raft::ceildiv(n, P::Nblk); grid.y = yChunks > minGridSize ? minGridSize : yChunks; grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; if (grid.x != 1) { - int i = 1; + std::size_t i = 1; while (grid.y * i < minGridSize) { i++; } @@ -336,4 +338,4 @@ dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) { } }; // namespace distance -}; // namespace raft \ No newline at end of file +}; // namespace raft diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index 0079f50e82..7640f3f816 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
* Copyright 2020 KETAN DATE & RAKESH NAGI * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,19 +24,17 @@ */ #pragma once -#include -#include -#include -#include -#include -#include #include "d_structs.h" #include #include +#include #include -#include +#include +#include + +#include namespace raft { namespace lap { diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh index 8c9012ed72..14ad877aa4 100644 --- a/cpp/include/raft/lap/lap_kernels.cuh +++ b/cpp/include/raft/lap/lap_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * Copyright 2020 KETAN DATE & RAKESH NAGI * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,18 +24,15 @@ */ #pragma once -#include -#include -#include -#include -#include -#include #include "d_structs.h" #include #include #include +#include + +#include namespace raft { namespace lap { namespace detail { diff --git a/cpp/include/raft/mr/allocator.hpp b/cpp/include/raft/mr/allocator.hpp index 707b71d468..08a4987c91 100644 --- a/cpp/include/raft/mr/allocator.hpp +++ b/cpp/include/raft/mr/allocator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,9 @@ #pragma once -#include +#include + +#include namespace raft { namespace mr { diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index 29e0d7cfcd..3a44175182 100644 --- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -16,8 +16,11 @@ #pragma once -#include #include + +#include + +#include #include #include diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp index 889e1640db..3d1ce38c31 100644 --- a/cpp/include/raft/mr/device/allocator.hpp +++ b/cpp/include/raft/mr/device/allocator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,10 +16,12 @@ #pragma once -#include #include + #include +#include + namespace raft { namespace mr { namespace device { diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp index 8af266d4f0..e5b3da24eb 100644 --- a/cpp/include/raft/mr/host/allocator.hpp +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -16,12 +16,13 @@ #pragma once -#include - -#include #include #include +#include + +#include + namespace raft { namespace mr { namespace host { diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 1ac075489a..3cffa1c28a 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -20,6 +20,7 @@ #include #include #include + #include #include @@ -27,6 +28,8 @@ #include #include +#include + namespace raft { namespace hierarchy { @@ -97,8 +100,8 @@ class UnionFind { template void build_dendrogram_host(const handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *data, - size_t nnz, value_idx *children, value_t *out_delta, - value_idx *out_size) { + std::size_t nnz, value_idx *children, + value_t *out_delta, value_idx *out_size) { auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); @@ -120,7 +123,7 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows, UnionFind U(nnz + 1); - for (value_idx i = 0; i < nnz; i++) { + for (std::size_t i = 0; i < nnz; i++) { value_idx a = mst_src_h[i]; value_idx b = mst_dst_h[i]; value_t delta = mst_weights_h[i]; @@ -167,7 +170,7 @@ __global__ void write_levels_kernel(const value_idx *children, */ template __global__ void inherit_labels(const value_idx *children, - const value_idx *levels, size_t n_leaves, + const value_idx *levels, std::size_t n_leaves, value_idx *labels, int cut_level, value_idx n_vertices) { value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; @@ -222,8 +225,8 @@ struct init_label_roots { */ template void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, - const value_idx *children, size_t n_clusters, - size_t n_leaves) { + const value_idx *children, + std::size_t n_clusters, std::size_t n_leaves) { auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); @@ -241,7 +244,7 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, * out for each of the children */ - size_t n_edges = (n_leaves - 1) * 2; + auto n_edges = (n_leaves - 1) * 2; thrust::device_ptr d_ptr = thrust::device_pointer_cast(children); @@ -250,7 +253,9 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, // Prevent potential infinite loop from labeling disconnected // connectivities graph. - RAFT_EXPECTS(n_vertices == (n_leaves - 1) * 2, + RAFT_EXPECTS(n_leaves > 0, "n_leaves must be positive"); + RAFT_EXPECTS(static_cast(n_vertices) == + static_cast((n_leaves - 1) * 2), "Multiple components found in MST or MST is invalid. 
" "Cannot find single-linkage solution."); diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index 9dbe2b67c5..09d5b568be 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -16,25 +16,22 @@ #pragma once -#include - #include #include +#include #include #include #include +#include #include #include +#include + #include -#include #include -#include - -#include -#include namespace raft { namespace sparse { @@ -106,8 +103,6 @@ void coo_sort(COO *const in, template void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, value_idx nnz, cudaStream_t stream) { - thrust::device_ptr t_rows = thrust::device_pointer_cast(rows); - thrust::device_ptr t_cols = thrust::device_pointer_cast(cols); thrust::device_ptr t_data = thrust::device_pointer_cast(data); auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); @@ -117,4 +112,4 @@ void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, } }; // namespace op }; // end NAMESPACE sparse -}; // end NAMESPACE raft \ No newline at end of file +}; // end NAMESPACE raft diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index e6ee09262e..328137f42d 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,12 +14,14 @@ * limitations under the License. */ +#include +#include + #include + +#include #include #include -#include - -#include namespace raft { @@ -37,8 +39,6 @@ TEST(Raft, EigenSolvers) { value_type* vs{nullptr}; index_type nnz = 0; index_type nrows = 0; - auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz}; ASSERT_EQ(nullptr, sm1.row_offsets_); @@ -53,7 +53,7 @@ TEST(Raft, EigenSolvers) { // value_type* eigvals{nullptr}; value_type* eigvecs{nullptr}; - unsigned long long seed{100110021003}; + std::uint64_t seed{100110021003}; eigen_solver_config_t cfg{ neigvs, maxiter, restart_iter, tol, reorthog, seed}; diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index d7aa76500b..94f81cddb8 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,19 +14,20 @@ * limitations under the License. 
*/ -#include - -#include -#include -#include -#include +#include "test_utils.h" #include #include - #include -#include "test_utils.h" +#include + +#include + +#include + +#include +#include template struct CSRHost { @@ -55,25 +56,25 @@ namespace mst { // Returns total weight of MST template weight_t prims(CSRHost &csr_h) { - auto n_vertices = csr_h.offsets.size() - 1; + std::size_t n_vertices = csr_h.offsets.size() - 1; bool active_vertex[n_vertices]; // bool mst_set[csr_h.n_edges]; weight_t curr_edge[n_vertices]; - for (auto i = 0; i < n_vertices; i++) { + for (std::size_t i = 0; i < n_vertices; i++) { active_vertex[i] = false; - curr_edge[i] = INT_MAX; + curr_edge[i] = static_cast(std::numeric_limits::max()); } curr_edge[0] = 0; // function to pick next min vertex-edge auto min_vertex_edge = [](auto *curr_edge, auto *active_vertex, auto n_vertices) { - weight_t min = INT_MAX; - vertex_t min_vertex; + auto min = static_cast(std::numeric_limits::max()); + vertex_t min_vertex{}; - for (auto v = 0; v < n_vertices; v++) { + for (std::size_t v = 0; v < n_vertices; v++) { if (!active_vertex[v] && curr_edge[v] < min) { min = curr_edge[v]; min_vertex = v; @@ -84,7 +85,7 @@ weight_t prims(CSRHost &csr_h) { }; // iterate over n vertices - for (auto v = 0; v < n_vertices - 1; v++) { + for (std::size_t v = 0; v < n_vertices - 1; v++) { // pick min vertex-edge auto curr_v = min_vertex_edge(curr_edge, active_vertex, n_vertices); @@ -106,7 +107,7 @@ weight_t prims(CSRHost &csr_h) { // find sum of MST weight_t total_weight = 0; - for (auto v = 1; v < n_vertices; v++) { + for (std::size_t v = 1; v < n_vertices; v++) { total_weight += curr_edge[v]; } diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index ce567e4298..a157a17e30 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -14,19 +14,21 @@ * limitations under the License. */ -#include -#include -#include -#include +#include "../test_utils.h" +#include #include #include +#include #include #include #include + #include -#include "../test_utils.h" +#include + +#include namespace raft { diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index 2b1ef89f7a..de6251d32d 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -14,13 +14,18 @@ * limitations under the License. 
*/ -#include +#include "../test_utils.h" + #include -#include #include + #include + +#include + +#include +#include #include -#include "../test_utils.h" namespace raft { namespace spatial { @@ -37,8 +42,7 @@ __global__ void build_actual_output(int *output, int n_rows, int k, int element = threadIdx.x + blockDim.x * blockIdx.x; if (element >= n_rows * k) return; - int ind = (int)indices[element]; - output[element] = idx_labels[ind]; + output[element] = idx_labels[indices[element]]; } __global__ void build_expected_output(int *output, int n_rows, int k, @@ -101,8 +105,8 @@ class KNNTest : public ::testing::TestWithParam { k_ = params_.k; std::vector row_major_input; - for (int i = 0; i < params_.input.size(); ++i) { - for (int j = 0; j < params_.input[i].size(); ++j) { + for (std::size_t i = 0; i < params_.input.size(); ++i) { + for (std::size_t j = 0; j < params_.input[i].size(); ++j) { row_major_input.push_back(params_.input[i][j]); } } From 947e22f65c2fd44f33f0f27c738bd74a1a1a69b4 Mon Sep 17 00:00:00 2001 From: Dillon Cullinan Date: Thu, 29 Jul 2021 12:01:38 -0400 Subject: [PATCH 005/171] ENH Replace gpuci_conda_retry with gpuci_mamba_retry (#295) `mamba` was recently added to gpuCI build environment, testing usage and solvability with this PR which should speed up build times. Authors: - Dillon Cullinan (https://github.com/dillon-cullinan) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/raft/pull/295 --- ci/gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 1cb3a0b6f7..a62354883b 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -39,13 +39,13 @@ env gpuci_logger "Check GPU usage" nvidia-smi -# temporary usage of gpuci_conda_retry install with packages listed here, looking into +# temporary usage of gpuci_mamba_retry install with packages listed here, looking into # using the repos yaml files for this gpuci_logger "Activate conda env" . /opt/conda/etc/profile.d/conda.sh conda activate rapids gpuci_logger "Installing packages needed for RAFT" -gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ +gpuci_mamba_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ "cudatoolkit=${CUDA_REL}" \ "cudf=${MINOR_VERSION}" \ "rmm=${MINOR_VERSION}" \ From e4d8a036a5e268ebe360eea1f0f249168e94ebfa Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 4 Aug 2021 11:48:43 +1000 Subject: [PATCH 006/171] Enable CUDA device code warnings as errors (#307) Adds `-Werror=all-warnings` NVCC flag to ensure all CUDA device code warnings are treated as errors. Only enabled on CUDA 11.2+ because CUDA 11.0 has PTXAS warnings that go away in newer CUDA versions. Missed this in #299. 
Authors: - Mark Harris (https://github.com/harrism) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/307 --- cpp/cmake/modules/ConfigureCUDA.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index 29203e86be..3786910be0 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -26,7 +26,9 @@ endif() list(APPEND RAFT_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) # set warnings as errors -list(APPEND RAFT_CUDA_FLAGS -Werror=cross-execution-space-call) +if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0) + list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings) +endif() list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) # Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking From 78b67af6cb7a919a5400c9679169c366221cf283 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Aug 2021 08:01:50 +1000 Subject: [PATCH 007/171] Fix more warnings (#311) Warnings missed in #299... Authors: - Mark Harris (https://github.com/harrism) Approvers: - Divye Gala (https://github.com/divyegala) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/311 --- cpp/include/raft/distance/distance.cuh | 34 +++++++++++------------ cpp/include/raft/linalg/cublas_wrappers.h | 5 +--- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 1b39a6ec18..fc0d07773f 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -47,7 +47,7 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + cudaStream_t stream, bool isRowMajor, InType) { raft::distance::euclideanAlgo1(m, n, k, x, y, dist, false, (AccType *)workspace, worksize, @@ -61,7 +61,7 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + cudaStream_t stream, bool isRowMajor, InType) { raft::distance::euclideanAlgo1(m, n, k, x, y, dist, true, (AccType *)workspace, worksize, @@ -75,7 +75,7 @@ struct DistanceImpl { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + cudaStream_t stream, bool isRowMajor, InType) { raft::distance::cosineAlgo1( m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream, isRowMajor); @@ -87,8 +87,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::euclideanAlgo2(m, n, k, x, y, dist, false, fin_op, stream, isRowMajor); @@ -100,8 +100,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, 
- cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::euclideanAlgo2(m, n, k, x, y, dist, true, fin_op, stream, isRowMajor); @@ -113,8 +113,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::l1Impl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } @@ -125,8 +125,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::chebyshevImpl(m, n, k, x, y, dist, fin_op, stream, isRowMajor); @@ -138,8 +138,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::hellingerImpl(m, n, k, x, y, dist, fin_op, stream, isRowMajor); @@ -151,8 +151,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType metric_arg) { raft::distance::minkowskiImpl(m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg); @@ -164,8 +164,8 @@ template { void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, - Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, - cudaStream_t stream, bool isRowMajor, InType metric_arg) { + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { raft::distance::canberraImpl( m, n, k, x, y, dist, fin_op, stream, isRowMajor); } diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h index 7c79e6c91d..1be14a550d 100644 --- a/cpp/include/raft/linalg/cublas_wrappers.h +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -86,10 +86,8 @@ inline const char *cublas_error_to_string(cublasStatus_t err) { /** FIXME: temporary alias for cuML compatibility */ #define CUBLAS_CHECK(call) CUBLAS_TRY(call) -///@todo: enable this once we have logging enabled -#if 0 /** check for cublas runtime API errors but do not assert */ -define CUBLAS_CHECK_NO_THROW(call) \ +#define CUBLAS_CHECK_NO_THROW(call) \ do { \ cublasStatus_t err = call; \ if (err != CUBLAS_STATUS_SUCCESS) { \ @@ -97,7 +95,6 @@ define CUBLAS_CHECK_NO_THROW(call) \ raft::linalg::detail::cublas_error_to_string(err)); \ } \ } while (0) -#endif namespace raft { namespace linalg { From 3196480e2b41060b886e597e8ce07ce7d5eb7daa Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 16 Aug 2021 15:41:03 -0500 Subject: [PATCH 008/171] Temporarily pin RMM while refactor removes deprecated calls (#315) This will unblock cuGraph's build. 
Authors: - Dante Gama Dessavre (https://github.com/dantegd) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/raft/pull/315 --- cpp/cmake/thirdparty/get_rmm.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 51f959a8d9..85ebc6238e 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -32,8 +32,8 @@ function(find_and_configure_rmm VERSION) INSTALL_EXPORT_SET raft-exports CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/rmm.git - GIT_TAG branch-${MAJOR_AND_MINOR} - GIT_SHALLOW TRUE + GIT_TAG 23bbe745af1d988224b5498f7b8e3fe3720532d4 + GIT_SHALLOW FALSE OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "CUDA_STATIC_RUNTIME ${CUDA_STATIC_RUNTIME}" From 8992816ea79f1404bd7aafc9b729f32df4c5e4f6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 25 Aug 2021 18:19:27 +0200 Subject: [PATCH 009/171] Update to UCX-Py 0.22 (#319) Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/raft/pull/319 --- ci/gpu/build.sh | 2 +- ci/local/old-gpubuild.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a62354883b..0ba9901107 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -51,7 +51,7 @@ gpuci_mamba_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "rmm=${MINOR_VERSION}" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "ucx-py=0.21.*" \ + "ucx-py=0.22.*" \ "rapids-build-env=${MINOR_VERSION}.*" \ "rapids-notebook-env=${MINOR_VERSION}.*" \ "rapids-doc-env=${MINOR_VERSION}.*" diff --git a/ci/local/old-gpubuild.sh b/ci/local/old-gpubuild.sh index eb2667149d..efd6c0382a 100644 --- a/ci/local/old-gpubuild.sh +++ b/ci/local/old-gpubuild.sh @@ -56,7 +56,7 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid "distributed>=2.12.0" \ "dask-cudf=${MINOR_VERSION}" \ "dask-cuda=${MINOR_VERSION}" \ - "ucx-py=0.21.*" + "ucx-py=0.22.*" if [ "$RUN_CUML_LIBCUML_TESTS" = "ON" ] || [ "$RUN_CUML_PRIMS_TESTS" = "ON" ] || [ "$RUN_CUML_PYTHON_TESTS" = "ON" ]; then gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \ From aab9b958399fee343e6ac9d476fd18fba4df04f8 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade <36705640+mdoijade@users.noreply.github.com> Date: Thu, 26 Aug 2021 04:25:22 +0530 Subject: [PATCH 010/171] Add Hamming, Jensen-Shannon, KL-Divergence, Russell rao and Correlation distance metrics support (#306) This PR introduces the following distances: - Hamming - Jensen-Shannon - Russell-Rao - KL-Divergence - Correlation with unit tests for each of them. Authors: - Mahesh Doijade (https://github.com/mdoijade) Approvers: - Corey J. 
Nolet (https://github.com/cjnolet) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/raft/pull/306 --- cpp/include/raft/distance/correlation.cuh | 247 +++++++++++++++++++ cpp/include/raft/distance/distance.cuh | 107 +++++++- cpp/include/raft/distance/hamming.cuh | 175 +++++++++++++ cpp/include/raft/distance/jensen_shannon.cuh | 181 ++++++++++++++ cpp/include/raft/distance/kl_divergence.cuh | 242 ++++++++++++++++++ cpp/include/raft/distance/russell_rao.cuh | 171 +++++++++++++ cpp/test/CMakeLists.txt | 5 + cpp/test/distance/dist_correlation.cu | 69 ++++++ cpp/test/distance/dist_hamming.cu | 69 ++++++ cpp/test/distance/dist_jensen_shannon.cu | 69 ++++++ cpp/test/distance/dist_kl_divergence.cu | 69 ++++++ cpp/test/distance/dist_russell_rao.cu | 69 ++++++ cpp/test/distance/distance_base.cuh | 163 +++++++++++- 13 files changed, 1632 insertions(+), 4 deletions(-) create mode 100644 cpp/include/raft/distance/correlation.cuh create mode 100644 cpp/include/raft/distance/hamming.cuh create mode 100644 cpp/include/raft/distance/jensen_shannon.cuh create mode 100644 cpp/include/raft/distance/kl_divergence.cuh create mode 100644 cpp/include/raft/distance/russell_rao.cuh create mode 100644 cpp/test/distance/dist_correlation.cu create mode 100644 cpp/test/distance/dist_hamming.cu create mode 100644 cpp/test/distance/dist_jensen_shannon.cu create mode 100644 cpp/test/distance/dist_kl_divergence.cu create mode 100644 cpp/test/distance/dist_russell_rao.cu diff --git a/cpp/include/raft/distance/correlation.cuh b/cpp/include/raft/distance/correlation.cuh new file mode 100644 index 0000000000..ed3b7a5464 --- /dev/null +++ b/cpp/include/raft/distance/correlation.cuh @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include + +namespace raft { +namespace distance { + +/** + * @brief the Correlation distance matrix: + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. 
details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void correlationImpl(const DataT *x, const DataT *y, const DataT *xn, + const DataT *yn, const DataT *x2n, const DataT *y2n, + IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, + IdxT ldd, OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + acc += x * y; + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [x2n, y2n, m, n, k] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { + DataT regx2n[KPolicy::AccRowsPerTh], regy2n[KPolicy::AccColsPerTh]; + + extern __shared__ char smem[]; + DataT *sx2Norm = + (DataT *)(&smem[KPolicy::SmemSize + + (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]); + DataT *sy2Norm = (&sx2Norm[KPolicy::Mblk]); + + // Load x & y norms required by this threadblock in shmem buffer + if (gridStrideX == blockIdx.x * KPolicy::Nblk) { + for (int i = threadIdx.x; i < KPolicy::Mblk; i += KPolicy::Nthreads) { + auto idx = gridStrideY + i; + sx2Norm[i] = idx < m ? x2n[idx] : 0; + } + } + + for (int i = threadIdx.x; i < KPolicy::Nblk; i += KPolicy::Nthreads) { + auto idx = gridStrideX + i; + sy2Norm[i] = idx < n ? 
y2n[idx] : 0; + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { + regx2n[i] = + sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)]; + } +#pragma unroll + for (int i = 0; i < KPolicy::AccColsPerTh; ++i) { + regy2n[i] = + sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)]; + } + +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); + auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); + auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); + + acc[i][j] = 1 - (numer / raft::mySqrt(Q_denom * R_denom)); + } + } + }; + + constexpr size_t shmemSize = + KPolicy::SmemSize + (2 * (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); + if (isRowMajor) { + constexpr auto correlationRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + correlationRowMajor); + correlationRowMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); + } else { + constexpr auto correlationColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + correlationColMajor); + correlationColMajor<<>>( + x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, + fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void correlation(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, const DataT *xn, + const DataT *yn, const DataT *x2n, const DataT *y2n, + OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + correlationImpl(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, + dOutput, fin_op, stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + correlationImpl(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, + dOutput, fin_op, stream); + } else { + correlationImpl( + x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the Correlation distance matrix calculation + * + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void correlationImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, AccType *workspace, size_t &worksize, + FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + correlationOutType; + Index_ lda, ldb, ldd; + correlationOutType *pDcast = reinterpret_cast(pD); + + ASSERT(!(((pA != pB) && (worksize < 2 * (m + n) * sizeof(AccType))) || + (worksize < 2 * m * sizeof(AccType))), + "workspace size error"); + ASSERT(workspace != nullptr, "workspace is null"); + + AccType *norm_col_vec = 
workspace; + AccType *norm_row_vec = workspace; + AccType *sq_norm_col_vec = workspace; + AccType *sq_norm_row_vec = workspace; + if (pA != pB) { + norm_row_vec += m; + + raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true, + stream, false, raft::Nop(), + raft::Sum()); + raft::linalg::reduce(norm_row_vec, pB, k, n, (AccType)0, isRowMajor, true, + stream, false, raft::Nop(), + raft::Sum()); + + sq_norm_col_vec += (m + n); + sq_norm_row_vec = sq_norm_col_vec + m; + raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, + isRowMajor, stream); + raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm, + isRowMajor, stream); + } else { + raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true, + stream, false, raft::Nop(), + raft::Sum()); + sq_norm_col_vec += m; + sq_norm_row_vec = sq_norm_col_vec; + raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, + isRowMajor, stream); + } + + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + correlation( + m, n, k, lda, ldb, ldd, pA, pB, norm_col_vec, norm_row_vec, + sq_norm_col_vec, sq_norm_row_vec, pDcast, fin_op, stream); + } else { + lda = n, ldb = m, ldd = m; + correlation(n, m, k, lda, ldb, ldd, pB, pA, norm_row_vec, + norm_col_vec, sq_norm_row_vec, sq_norm_col_vec, pDcast, + fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index fc0d07773f..02d8fb6d03 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -21,11 +21,16 @@ #include #include #include +#include #include #include +#include #include +#include +#include #include #include +#include #include namespace raft { @@ -171,6 +176,72 @@ struct DistanceImpl +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { + raft::distance::hammingUnexpandedImpl(m, n, k, x, y, dist, fin_op, + stream, isRowMajor); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { + raft::distance::jensenShannonImpl(m, n, k, x, y, dist, fin_op, + stream, isRowMajor); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { + raft::distance::russellRaoImpl(m, n, k, x, y, dist, fin_op, stream, + isRowMajor); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor, InType) { + raft::distance::klDivergenceImpl(m, n, k, x, y, dist, fin_op, + stream, isRowMajor); + } +}; + +template +struct DistanceImpl { + void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n, + Index_ k, void *workspace, size_t worksize, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor, InType) { + raft::distance::correlationImpl(m, n, k, x, y, dist, + (AccType *)workspace, worksize, + fin_op, stream, isRowMajor); + } +}; + } // anonymous namespace /** @@ -195,11 +266,16 @@ size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n, Index_ k) { size_t 
worksize = 0; constexpr bool is_allocated = - distanceType <= raft::distance::DistanceType::CosineExpanded; + (distanceType <= raft::distance::DistanceType::CosineExpanded) || + (distanceType == raft::distance::DistanceType::CorrelationExpanded); + constexpr int numOfBuffers = + (distanceType == raft::distance::DistanceType::CorrelationExpanded) ? 2 : 1; + if (is_allocated) { - worksize += m * sizeof(AccType); - if (x != y) worksize += n * sizeof(AccType); + worksize += numOfBuffers * m * sizeof(AccType); + if (x != y) worksize += numOfBuffers * n * sizeof(AccType); } + return worksize; } @@ -366,6 +442,31 @@ void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, raft::distance::DistanceType::Canberra>( x, y, dist, m, n, k, workspace, stream, isRowMajor); break; + case raft::distance::DistanceType::HammingUnexpanded: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; + case raft::distance::DistanceType::JensenShannon: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; + case raft::distance::DistanceType::RusselRaoExpanded: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; + case raft::distance::DistanceType::KLDivergence: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; + case raft::distance::DistanceType::CorrelationExpanded: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, stream, isRowMajor); + break; default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; diff --git a/cpp/include/raft/distance/hamming.cuh b/cpp/include/raft/distance/hamming.cuh new file mode 100644 index 0000000000..08f1020b85 --- /dev/null +++ b/cpp/include/raft/distance/hamming.cuh @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the Hamming distance matrix using the unexpanded form: + * It computes the following equation: + Cij = sum(x_i != y_i) / k + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. 
details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void hammingUnexpandedImpl(const DataT *x, const DataT *y, IdxT m, + IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + acc += (x != y); + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [k] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { + const DataT one_over_k = DataT(1.0) / k; +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] *= one_over_k; + } + } + }; + + if (isRowMajor) { + auto hammingUnexpandedRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + hammingUnexpandedRowMajor); + + hammingUnexpandedRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } else { + auto hammingUnexpandedColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + hammingUnexpandedColMajor); + hammingUnexpandedColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + hammingUnexpandedImpl(x, y, m, n, k, lda, ldb, ldd, + dOutput, fin_op, stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + hammingUnexpandedImpl(x, y, m, n, k, lda, ldb, ldd, + dOutput, fin_op, stream); + } else { + hammingUnexpandedImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the Hamming Unexpanded distance matrix calculation + * It computes the following equation: + Cij = sum(x_i != y_i) / k + * + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD 
output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void hammingUnexpandedImpl(int m, int n, int k, const InType *pA, + const InType *pB, OutType *pD, FinalLambda fin_op, + cudaStream_t stream, bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + hammingUnexpandedOutType; + Index_ lda, ldb, ldd; + hammingUnexpandedOutType *pDcast = + reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + hammingUnexpanded(m, n, k, lda, ldb, ldd, pA, pB, pDcast, + fin_op, stream); + + } else { + lda = n, ldb = m, ldd = m; + hammingUnexpanded(n, m, k, lda, ldb, ldd, pB, pA, + pDcast, fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/jensen_shannon.cuh b/cpp/include/raft/distance/jensen_shannon.cuh new file mode 100644 index 0000000000..2a94205853 --- /dev/null +++ b/cpp/include/raft/distance/jensen_shannon.cuh @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the Jensen Shannon distance matrix: + * It computes the following equation: + Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) + + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. 
details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void jensenShannonImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, + IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + const DataT m = 0.5f * (x + y); + const bool m_zero = (m == 0); + const auto logM = (!m_zero) * raft::myLog(m + m_zero); + + const bool x_zero = (x == 0); + const bool y_zero = (y == 0); + acc += (-x * (logM - raft::myLog(x + x_zero))) + + (-y * (logM - raft::myLog(y + y_zero))); + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] = raft::mySqrt(0.5 * acc[i][j]); + } + } + }; + + if (isRowMajor) { + auto jensenShannonRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + jensenShannonRowMajor); + + jensenShannonRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } else { + auto jensenShannonColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + jensenShannonColMajor); + jensenShannonColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + jensenShannonImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + jensenShannonImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else { + jensenShannonImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the Jensen Shannon distance matrix calculation + * It computes the following equation: + Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) + + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) + * + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * 
@tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void jensenShannonImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + jensenShannonOutType; + Index_ lda, ldb, ldd; + jensenShannonOutType *pDcast = reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + jensenShannon(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + + } else { + lda = n, ldb = m, ldd = m; + jensenShannon(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, + stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/kl_divergence.cuh b/cpp/include/raft/distance/kl_divergence.cuh new file mode 100644 index 0000000000..3197b73d10 --- /dev/null +++ b/cpp/include/raft/distance/kl_divergence.cuh @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the KL Divergence distance matrix: + * It computes the following equation: + Cij = 0.5 * sum(x * log (x / y)); + * This distance computation modifies A or B by computing a log(x) + * and then performing a `pow(e, log(x))` to convert it back. Because of this, + * it is possible that the values in A or B might differ slightly + * after this is invoked. + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. 
details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, + IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + OutT *dOutput, FinalLambda fin_op, + cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + if (isRowMajor) { + const bool x_zero = (x == 0); + acc += x * (raft::myLog(x + x_zero) - y); + } else { + const bool y_zero = (y == 0); + acc += y * (raft::myLog(y + y_zero) - x); + } + }; + + auto core_lambda_x_equal_y = [] __device__(AccT & acc, DataT & x, DataT & y) { + if (isRowMajor) { + const bool x_zero = (x == 0); + const bool y_zero = (y == 0); + acc += + x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero)); + } else { + const bool y_zero = (y == 0); + const bool x_zero = (x == 0); + acc += + y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero)); + } + }; + + auto unaryOp_lambda = [] __device__(DataT input) { + const bool x_zero = (input == 0); + return (!x_zero) * raft::myLog(input + x_zero); + }; + + auto unaryOp_lambda_reverse = [] __device__(DataT input) { + // reverse previous log (x) back to x using (e ^ log(x)) + const bool x_zero = (input == 0); + return (!x_zero) * raft::myExp(input); + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = [] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] = (0.5f * acc[i][j]); + } + } + }; + + if (isRowMajor) { + constexpr auto klDivergenceRowMajor = + pairwiseDistanceMatKernel; + constexpr auto klDivergenceRowMajorXequalY = + pairwiseDistanceMatKernel; + if (x != y) { + raft::linalg::unaryOp( + (DataT *)y, y, n * k, unaryOp_lambda, stream); + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + klDivergenceRowMajor); + klDivergenceRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + // Now reverse previous log (x) back to x using (e ^ log(x)) + raft::linalg::unaryOp( + (DataT *)y, y, n * k, unaryOp_lambda_reverse, stream); + } else { + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + klDivergenceRowMajorXequalY); + klDivergenceRowMajorXequalY<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, + core_lambda_x_equal_y, epilog_lambda, fin_op); + } + } else { + constexpr auto klDivergenceColMajor = + pairwiseDistanceMatKernel; + constexpr auto klDivergenceColMajorXequalY = + pairwiseDistanceMatKernel; + if (x 
!= y) { + raft::linalg::unaryOp( + (DataT *)x, x, m * k, unaryOp_lambda, stream); + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + klDivergenceColMajor); + klDivergenceColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + // Now reverse previous log (x) back to x using (e ^ log(x)) + raft::linalg::unaryOp( + (DataT *)x, x, m * k, unaryOp_lambda_reverse, stream); + } else { + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + klDivergenceColMajorXequalY); + klDivergenceColMajorXequalY<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, + core_lambda_x_equal_y, epilog_lambda, fin_op); + } + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + klDivergenceImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + klDivergenceImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else { + klDivergenceImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the KL Divergence distance matrix calculation + * It computes the following equation: + Cij = 0.5 * sum(x * log (x / y)); + * This distance computation modifies A or B by computing a log(x) + * and then performing a `pow(e, log(x))` to convert it back. Because of this, + * it is possible that the values in A or B might differ slightly + * after this is invoked. + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void klDivergenceImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + klDivergenceOutType; + Index_ lda, ldb, ldd; + klDivergenceOutType *pDcast = reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + klDivergence(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + + } else { + lda = n, ldb = m, ldd = m; + klDivergence(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/include/raft/distance/russell_rao.cuh b/cpp/include/raft/distance/russell_rao.cuh new file mode 100644 index 0000000000..417fb73b94 --- /dev/null +++ b/cpp/include/raft/distance/russell_rao.cuh @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft { +namespace distance { + +/** + * @brief the Russell Rao distance matrix: + * It computes the following equation: + Cij = (k - sum(x_i * y_i)) / k + * + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam IdxT index data-type + * @tparam Veclen number of k-elements loaded by each thread + for every LDG call. details in contractions.cuh + * @tparam FinalLambda final lambda called on final distance value + * @tparam isRowMajor true if input/output is row major, + false for column major + * @param[in] x input matrix + * @param[in] y input matrix + * @param[in] m number of rows of A and C/D + * @param[in] n number of rows of B and C/D + * @param[in] k number of cols of A and B + * @param[in] lda leading dimension of A + * @param[in] ldb leading dimension of B + * @param[in] ldd leading dimension of C/D + * @param[output] dOutput output matrix + * @param[in] fin_op the final gemm epilogue lambda + * @param[in] stream cuda stream to launch work + */ +template +static void russellRaoImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, + IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef + typename std::conditional::type KPolicy; + + dim3 blk(KPolicy::Nthreads); + + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + acc += x * y; + }; + + const float one_over_k = 1.0 / k; + // epilogue operation lambda for final value calculation + auto epilog_lambda = [k, one_over_k] __device__( + AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], + DataT * regxn, DataT * regyn, IdxT gridStrideX, + IdxT gridStrideY) { +#pragma unroll + for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { + acc[i][j] = (k - acc[i][j]) * one_over_k; + } + } + }; + + if (isRowMajor) { + constexpr auto russellRaoRowMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + russellRaoRowMajor); + + russellRaoRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } else { + constexpr auto russellRaoColMajor = + pairwiseDistanceMatKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, + russellRaoColMajor); + russellRaoColMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, + epilog_lambda, fin_op); + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, OutT *dOutput, + FinalLambda fin_op, cudaStream_t stream) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + russellRaoImpl(x, y, m, n, k, lda, ldb, ldd, 
dOutput, fin_op, + stream); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + russellRaoImpl(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, + stream); + } else { + russellRaoImpl( + x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + } +} + +/** + * @brief the Russell Rao distance matrix calculation + * It computes the following equation: + Cij = (k - sum(x_i * y_i)) / k + * + * @tparam InType input data-type (for A and B matrices) + * @tparam AccType accumulation data-type + * @tparam OutType output data-type (for C and D matrices) + * @tparam FinalLambda user-defined epilogue lamba + * @tparam Index_ Index type + * @param m number of rows of A and C/D + * @param n number of columns of B and C/D + * @param k number of cols of A and rows of B + * @param pA input matrix + * @param pB input matrix + * @param pD output matrix + * @param fin_op the final element-wise epilogue lambda + * @param stream cuda stream where to launch work + * @param isRowMajor whether the input and output matrices are row major + */ +template +void russellRaoImpl(int m, int n, int k, const InType *pA, const InType *pB, + OutType *pD, FinalLambda fin_op, cudaStream_t stream, + bool isRowMajor) { + typedef std::is_same is_bool; + typedef typename std::conditional::type + russellRaoOutType; + Index_ lda, ldb, ldd; + russellRaoOutType *pDcast = reinterpret_cast(pD); + if (isRowMajor) { + lda = k, ldb = k, ldd = n; + russellRao( + m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + + } else { + lda = n, ldb = m, ldd = m; + russellRao( + n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + } +} +} // namespace distance +} // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index f94a8d9525..0428e09142 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -21,12 +21,17 @@ add_executable(test_raft test/distance/dist_adj.cu test/distance/dist_canberra.cu test/distance/dist_chebyshev.cu + test/distance/dist_correlation.cu test/distance/dist_cos.cu test/distance/dist_euc_exp.cu test/distance/dist_euc_unexp.cu + test/distance/dist_hamming.cu test/distance/dist_hellinger.cu + test/distance/dist_jensen_shannon.cu + test/distance/dist_kl_divergence.cu test/distance/dist_l1.cu test/distance/dist_minkowski.cu + test/distance/dist_russell_rao.cu test/distance/fused_l2_nn.cu test/eigen_solvers.cu test/handle.cpp diff --git a/cpp/test/distance/dist_correlation.cu b/cpp/test/distance/dist_correlation.cu new file mode 100644 index 0000000000..5d84f18e52 --- /dev/null +++ b/cpp/test/distance/dist_correlation.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceCorrelation + : public DistanceTest {}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceCorrelation DistanceCorrelationF; +TEST_P(DistanceCorrelationF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceCorrelation DistanceCorrelationD; +TEST_P(DistanceCorrelationD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_hamming.cu b/cpp/test/distance/dist_hamming.cu new file mode 100644 index 0000000000..47febd825b --- /dev/null +++ b/cpp/test/distance/dist_hamming.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceHamming + : public DistanceTest {}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceHamming DistanceHammingF; +TEST_P(DistanceHammingF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? 
params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceHamming DistanceHammingD; +TEST_P(DistanceHammingD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_jensen_shannon.cu b/cpp/test/distance/dist_jensen_shannon.cu new file mode 100644 index 0000000000..bc0b56f506 --- /dev/null +++ b/cpp/test/distance/dist_jensen_shannon.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceJensenShannon + : public DistanceTest { +}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceJensenShannon DistanceJensenShannonF; +TEST_P(DistanceJensenShannonF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceJensenShannon DistanceJensenShannonD; +TEST_P(DistanceJensenShannonD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? 
params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_kl_divergence.cu b/cpp/test/distance/dist_kl_divergence.cu new file mode 100644 index 0000000000..884ac4b948 --- /dev/null +++ b/cpp/test/distance/dist_kl_divergence.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceKLDivergence + : public DistanceTest { +}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceKLDivergence DistanceKLDivergenceF; +TEST_P(DistanceKLDivergenceF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceKLDivergence DistanceKLDivergenceD; +TEST_P(DistanceKLDivergenceD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/dist_russell_rao.cu b/cpp/test/distance/dist_russell_rao.cu new file mode 100644 index 0000000000..74ccfb0c2e --- /dev/null +++ b/cpp/test/distance/dist_russell_rao.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "distance_base.cuh" + +namespace raft { +namespace distance { + +template +class DistanceRussellRao + : public DistanceTest {}; + +const std::vector> inputsf = { + {0.001f, 1024, 1024, 32, true, 1234ULL}, + {0.001f, 1024, 32, 1024, true, 1234ULL}, + {0.001f, 32, 1024, 1024, true, 1234ULL}, + {0.003f, 1024, 1024, 1024, true, 1234ULL}, + {0.001f, 1024, 1024, 32, false, 1234ULL}, + {0.001f, 1024, 32, 1024, false, 1234ULL}, + {0.001f, 32, 1024, 1024, false, 1234ULL}, + {0.003f, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceRussellRao DistanceRussellRaoF; +TEST_P(DistanceRussellRaoF, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoF, + ::testing::ValuesIn(inputsf)); + +const std::vector> inputsd = { + {0.001, 1024, 1024, 32, true, 1234ULL}, + {0.001, 1024, 32, 1024, true, 1234ULL}, + {0.001, 32, 1024, 1024, true, 1234ULL}, + {0.003, 1024, 1024, 1024, true, 1234ULL}, + {0.001, 1024, 1024, 32, false, 1234ULL}, + {0.001, 1024, 32, 1024, false, 1234ULL}, + {0.001, 32, 1024, 1024, false, 1234ULL}, + {0.003, 1024, 1024, 1024, false, 1234ULL}, +}; +typedef DistanceRussellRao DistanceRussellRaoD; +TEST_P(DistanceRussellRaoD, Result) { + int m = params.isRowMajor ? params.m : params.n; + int n = params.isRowMajor ? params.n : params.m; + ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n, + raft::CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoD, + ::testing::ValuesIn(inputsd)); + +} // end namespace distance +} // end namespace raft diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index fc7b064205..9e3290593d 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -160,6 +160,138 @@ __global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x, dist[outidx] = acc; } +template +__global__ void naiveHammingDistanceKernel(DataType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + DataType acc = DataType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + acc += (a != b); + } + acc = acc / k; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + +template +__global__ void naiveJensenShannonDistanceKernel(DataType *dist, + const DataType *x, + const DataType *y, int m, + int n, int k, + bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + DataType acc = DataType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? 
i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + + DataType m = 0.5f * (a + b); + bool a_zero = a == 0; + bool b_zero = b == 0; + + DataType p = (!a_zero * m) / (a_zero + a); + DataType q = (!b_zero * m) / (b_zero + b); + + bool p_zero = p == 0; + bool q_zero = q == 0; + + acc += + (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero))); + } + acc = raft::mySqrt(0.5f * acc); + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + +template +__global__ void naiveRussellRaoDistanceKernel(OutType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + OutType acc = OutType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + acc += (a * b); + } + acc = (k - acc) / k; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + +template +__global__ void naiveKLDivergenceDistanceKernel(OutType *dist, + const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + OutType acc = OutType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + bool b_zero = (b == 0); + const auto m = (!b_zero) * (a / b); + const bool m_zero = (m == 0); + acc += (a * (!m_zero) * log(m + m_zero)); + } + acc = 0.5f * acc; + int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + +template +__global__ void naiveCorrelationDistanceKernel(OutType *dist, const DataType *x, + const DataType *y, int m, int n, + int k, bool isRowMajor) { + int midx = threadIdx.x + blockIdx.x * blockDim.x; + int nidx = threadIdx.y + blockIdx.y * blockDim.y; + if (midx >= m || nidx >= n) return; + OutType acc = OutType(0); + auto a_norm = DataType(0); + auto b_norm = DataType(0); + auto a_sq_norm = DataType(0); + auto b_sq_norm = DataType(0); + for (int i = 0; i < k; ++i) { + int xidx = isRowMajor ? i + midx * k : i * m + midx; + int yidx = isRowMajor ? i + nidx * k : i * n + nidx; + auto a = x[xidx]; + auto b = y[yidx]; + a_norm += a; + b_norm += b; + a_sq_norm += (a * a); + b_sq_norm += (b * b); + acc += (a * b); + } + + auto numer = k * acc - (a_norm * b_norm); + auto Q_denom = k * a_sq_norm - (a_norm * a_norm); + auto R_denom = k * b_sq_norm - (b_norm * b_norm); + + acc = 1 - (numer / raft::mySqrt(Q_denom * R_denom)); + + int outidx = isRowMajor ? 
midx * n + nidx : midx + m * nidx; + dist[outidx] = acc; +} + template void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, int n, int k, raft::distance::DistanceType type, @@ -193,6 +325,26 @@ void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m, naiveLpUnexpDistanceKernel <<>>(dist, x, y, m, n, k, isRowMajor, metric_arg); break; + case raft::distance::DistanceType::HammingUnexpanded: + naiveHammingDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; + case raft::distance::DistanceType::JensenShannon: + naiveJensenShannonDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; + case raft::distance::DistanceType::RusselRaoExpanded: + naiveRussellRaoDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; + case raft::distance::DistanceType::KLDivergence: + naiveKLDivergenceDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; + case raft::distance::DistanceType::CorrelationExpanded: + naiveCorrelationDistanceKernel + <<>>(dist, x, y, m, n, k, isRowMajor); + break; default: FAIL() << "should be here\n"; } @@ -247,10 +399,19 @@ class DistanceTest : public ::testing::TestWithParam> { raft::allocate(dist_ref, m * n); raft::allocate(dist, m * n); raft::allocate(dist2, m * n); - if (distanceType == raft::distance::DistanceType::HellingerExpanded) { + if (distanceType == raft::distance::DistanceType::HellingerExpanded || + distanceType == raft::distance::DistanceType::JensenShannon || + distanceType == raft::distance::DistanceType::KLDivergence) { // Hellinger works only on positive numbers r.uniform(x, m * k, DataType(0.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(0.0), DataType(1.0), stream); + } else if (distanceType == + raft::distance::DistanceType::RusselRaoExpanded) { + r.uniform(x, m * k, DataType(0.0), DataType(1.0), stream); + r.uniform(y, n * k, DataType(0.0), DataType(1.0), stream); + // Russel rao works on boolean values. 
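+      // bernoulli(p = 0.5) is assumed here to overwrite the uniform fill above
+      // with 0/1 samples, which the Russell-Rao reference kernel treats as
+      // boolean input.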
+ r.bernoulli(x, m * k, 0.5f, stream); + r.bernoulli(y, n * k, 0.5f, stream); } else { r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); From 1c4e1e600b8b5da4265290f031e7785e824b706e Mon Sep 17 00:00:00 2001 From: Victor Lafargue Date: Fri, 27 Aug 2021 17:26:55 +0200 Subject: [PATCH 011/171] Miscellaneous tech debts/cleanups (#286) Miscellaneous updates to solve tech debts in RAFT : - [x] Removal of handle host and device allocators - [x] Addition of a `get_thrust_policy` method to the handle - [x] Usage of `get_thrust_policy` where handle is available - [x] Removal of `rmm::device_vector` - [x] Use of RMM device allocator in the `raft::allocate` function - [x] Creation of an allocation + deallocation helper system - [x] Usage of `rmm::exec_policy` instead of `thrust::cuda::par.on` when no handle is available Authors: - Victor Lafargue (https://github.com/viclafargue) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/286 --- cpp/cmake/thirdparty/get_rmm.cmake | 2 +- cpp/include/raft/common/cub_wrappers.cuh | 4 +- cpp/include/raft/comms/helper.hpp | 10 +- cpp/include/raft/comms/std_comms.hpp | 40 ++---- cpp/include/raft/comms/test.hpp | 41 ++---- cpp/include/raft/cudart_utils.h | 46 +++++-- cpp/include/raft/distance/distance.cuh | 7 +- cpp/include/raft/handle.hpp | 30 +--- cpp/include/raft/label/classlabels.cuh | 57 ++++---- cpp/include/raft/lap/lap.cuh | 87 ++++++------ cpp/include/raft/lap/lap_functions.cuh | 47 +++---- .../raft/linalg/cholesky_r1_update.cuh | 8 +- cpp/include/raft/linalg/eig.cuh | 28 ++-- cpp/include/raft/linalg/init.h | 3 +- cpp/include/raft/linalg/qr.cuh | 19 ++- cpp/include/raft/linalg/svd.cuh | 28 ++-- cpp/include/raft/linalg/transpose.h | 4 +- cpp/include/raft/matrix/math.cuh | 10 +- cpp/include/raft/matrix/matrix.cuh | 14 +- cpp/include/raft/random/rng.cuh | 17 +-- cpp/include/raft/sparse/convert/csr.cuh | 24 ++-- cpp/include/raft/sparse/coo.cuh | 38 +++-- cpp/include/raft/sparse/csr.cuh | 19 +-- .../raft/sparse/distance/bin_distance.cuh | 11 +- cpp/include/raft/sparse/distance/coo_spmv.cuh | 3 - .../coo_spmv_strategies/base_strategy.cuh | 2 - .../coo_mask_row_iterators.cuh | 1 - .../coo_spmv_strategies/hash_strategy.cuh | 2 +- cpp/include/raft/sparse/distance/distance.cuh | 1 - .../raft/sparse/distance/ip_distance.cuh | 3 +- .../raft/sparse/distance/l2_distance.cuh | 21 +-- cpp/include/raft/sparse/distance/utils.cuh | 2 - .../sparse/hierarchy/detail/agglomerative.cuh | 13 +- .../hierarchy/detail/connectivities.cuh | 13 +- .../raft/sparse/hierarchy/detail/mst.cuh | 16 +-- .../raft/sparse/hierarchy/single_linkage.hpp | 1 - cpp/include/raft/sparse/linalg/add.cuh | 10 +- cpp/include/raft/sparse/linalg/spectral.cuh | 23 ++-- cpp/include/raft/sparse/linalg/symmetrize.cuh | 33 ++--- cpp/include/raft/sparse/linalg/transpose.h | 12 +- .../raft/sparse/mst/detail/mst_solver_inl.cuh | 122 ++++++++-------- cpp/include/raft/sparse/mst/detail/utils.cuh | 4 +- cpp/include/raft/sparse/mst/mst_solver.cuh | 24 ++-- cpp/include/raft/sparse/op/filter.cuh | 36 ++--- cpp/include/raft/sparse/op/reduce.cuh | 10 +- cpp/include/raft/sparse/op/sort.h | 21 +-- .../sparse/selection/connect_components.cuh | 36 ++--- cpp/include/raft/sparse/selection/knn.cuh | 2 - .../raft/sparse/selection/knn_graph.cuh | 1 - cpp/include/raft/spatial/knn/ann.hpp | 3 - .../knn/detail/ann_quantized_faiss.cuh | 8 +- .../knn/detail/knn_brute_force_faiss.cuh | 19 ++- 
.../raft/spatial/knn/detail/processing.hpp | 33 ++--- cpp/include/raft/spatial/knn/knn.hpp | 6 +- cpp/include/raft/spectral/cluster_solvers.hpp | 9 +- cpp/include/raft/spectral/kmeans.hpp | 130 ++++++++---------- cpp/include/raft/spectral/matrix_wrappers.hpp | 83 ++++------- .../raft/spectral/modularity_maximization.hpp | 29 ++-- cpp/include/raft/spectral/partition.hpp | 29 ++-- cpp/include/raft/spectral/spectral_util.hpp | 23 ++-- cpp/test/cluster_solvers.cu | 9 +- cpp/test/distance/dist_adj.cu | 22 ++- cpp/test/distance/distance_base.cuh | 24 ++-- cpp/test/distance/fused_l2_nn.cu | 32 ++--- cpp/test/eigen_solvers.cu | 9 +- cpp/test/label/label.cu | 34 ++--- cpp/test/label/merge_labels.cu | 6 +- cpp/test/lap/lap.cu | 17 ++- cpp/test/linalg/add.cu | 14 +- cpp/test/linalg/binary_op.cu | 16 +-- cpp/test/linalg/cholesky_r1.cu | 27 ++-- cpp/test/linalg/coalesced_reduction.cu | 15 +- cpp/test/linalg/divide.cu | 15 +- cpp/test/linalg/eig.cu | 34 ++--- cpp/test/linalg/eig_sel.cu | 19 +-- cpp/test/linalg/eltwise.cu | 14 +- cpp/test/linalg/map.cu | 25 ++-- cpp/test/linalg/map_then_reduce.cu | 26 ++-- cpp/test/linalg/matrix_vector_op.cu | 21 ++- cpp/test/linalg/multiply.cu | 15 +- cpp/test/linalg/norm.cu | 18 ++- cpp/test/linalg/reduce.cu | 10 +- cpp/test/linalg/reduce.cuh | 15 +- cpp/test/linalg/strided_reduction.cu | 10 +- cpp/test/linalg/subtract.cu | 18 ++- cpp/test/linalg/svd.cu | 28 ++-- cpp/test/linalg/transpose.cu | 12 +- cpp/test/linalg/unary_op.cu | 11 +- cpp/test/matrix/math.cu | 51 +++---- cpp/test/matrix/matrix.cu | 34 +++-- cpp/test/mr/device/buffer.cpp | 18 ++- cpp/test/mst.cu | 24 +++- cpp/test/random/rng.cu | 41 +++--- cpp/test/random/rng_int.cu | 12 +- cpp/test/random/sample_without_replacement.cu | 14 +- cpp/test/sparse/add.cu | 43 ++---- cpp/test/sparse/connect_components.cu | 11 +- cpp/test/sparse/convert_coo.cu | 10 +- cpp/test/sparse/convert_csr.cu | 26 ++-- cpp/test/sparse/csr_row_slice.cu | 34 ++--- cpp/test/sparse/csr_to_dense.cu | 22 +-- cpp/test/sparse/csr_transpose.cu | 36 ++--- cpp/test/sparse/degree.cu | 36 ++--- cpp/test/sparse/dist_coo_spmv.cu | 29 ++-- cpp/test/sparse/distance.cu | 21 +-- cpp/test/sparse/filter.cu | 13 +- cpp/test/sparse/knn.cu | 27 ++-- cpp/test/sparse/knn_graph.cu | 16 +-- cpp/test/sparse/linkage.cu | 39 ++---- cpp/test/sparse/norm.cu | 15 +- cpp/test/sparse/reduce.cu | 3 +- cpp/test/sparse/row_op.cu | 12 +- cpp/test/sparse/selection.cu | 24 +--- cpp/test/sparse/sort.cu | 13 +- cpp/test/sparse/symmetrize.cu | 33 ++--- cpp/test/spatial/haversine.cu | 33 ++--- cpp/test/spatial/knn.cu | 34 +++-- cpp/test/spectral_matrix.cu | 19 ++- cpp/test/stats/mean.cu | 10 +- cpp/test/stats/mean_center.cu | 18 ++- cpp/test/stats/stddev.cu | 18 ++- cpp/test/stats/sum.cu | 12 +- python/raft/common/handle.pxd | 2 - 123 files changed, 1114 insertions(+), 1583 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 85ebc6238e..e990ab1367 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -44,4 +44,4 @@ endfunction() set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") -find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) +find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) \ No newline at end of file diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh index 8d5b29f700..8e3519fea5 100644 --- a/cpp/include/raft/common/cub_wrappers.cuh +++ b/cpp/include/raft/common/cub_wrappers.cuh @@ -17,7 +17,7 @@ #pragma once #include 
-#include +#include namespace raft { @@ -34,7 +34,7 @@ namespace raft { * @param stream cuda stream */ template -void sortPairs(raft::mr::device::buffer &workspace, const KeyT *inKeys, +void sortPairs(rmm::device_uvector &workspace, const KeyT *inKeys, KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len, cudaStream_t stream) { size_t worksize; diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index 7b24e31bbe..e01490d728 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -38,11 +38,10 @@ namespace comms { */ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm, int num_ranks, int rank) { - auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, num_ranks, rank, d_alloc, stream))); + new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream))); handle->set_comms(communicator); } @@ -80,12 +79,11 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm, } } - auto d_alloc = handle->get_device_allocator(); cudaStream_t stream = handle->get_stream(); - auto communicator = std::make_shared(std::unique_ptr( - new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, - num_ranks, rank, d_alloc, stream))); + auto communicator = std::make_shared( + std::unique_ptr(new raft::comms::std_comms( + nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream))); handle->set_comms(communicator); } diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 765e8741bb..ff75931fb9 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -20,7 +20,7 @@ #include #include -#include +#include #include @@ -64,17 +64,16 @@ class std_comms : public comms_iface { */ std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker, std::shared_ptr eps, int num_ranks, int rank, - const std::shared_ptr device_allocator, cudaStream_t stream, bool subcomms_ucp = true) : nccl_comm_(nccl_comm), stream_(stream), + status_(2, stream), num_ranks_(num_ranks), rank_(rank), subcomms_ucp_(subcomms_ucp), ucp_worker_(ucp_worker), ucp_eps_(eps), - next_request_id_(0), - device_allocator_(device_allocator) { + next_request_id_(0) { initialize(); }; @@ -85,27 +84,19 @@ class std_comms : public comms_iface { * @param rank rank of the current worker */ std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, - const std::shared_ptr device_allocator, cudaStream_t stream) : nccl_comm_(nccl_comm), stream_(stream), + status_(2, stream), num_ranks_(num_ranks), rank_(rank), - subcomms_ucp_(false), - device_allocator_(device_allocator) { + subcomms_ucp_(false) { initialize(); }; - virtual ~std_comms() { - device_allocator_->deallocate(sendbuff_, sizeof(int), stream_); - device_allocator_->deallocate(recvbuff_, sizeof(int), stream_); - } - void initialize() { - sendbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); - recvbuff_ = reinterpret_cast( - device_allocator_->allocate(sizeof(int), stream_)); + sendbuff_ = status_.data(); + recvbuff_ = status_.data() + 1; } int get_size() const { return num_ranks_; } @@ -113,8 +104,8 @@ class std_comms : public comms_iface { int get_rank() const { return rank_; } std::unique_ptr comm_split(int color, int key) const { - mr::device::buffer d_colors(device_allocator_, stream_, get_size()); - mr::device::buffer d_keys(device_allocator_, stream_, 
get_size()); + rmm::device_uvector d_colors(get_size(), stream_); + rmm::device_uvector d_keys(get_size(), stream_); update_device(d_colors.data() + get_rank(), &color, 1, stream_); update_device(d_keys.data() + get_rank(), &key, 1, stream_); @@ -167,12 +158,12 @@ class std_comms : public comms_iface { if (ucp_worker_ != nullptr && subcomms_ucp_) { auto eps_sp = std::make_shared(new_ucx_ptrs.data()); - return std::unique_ptr(new std_comms( - nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, subcomm_ranks.size(), key, - device_allocator_, stream_, subcomms_ucp_)); + return std::unique_ptr( + new std_comms(nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, + subcomm_ranks.size(), key, stream_, subcomms_ucp_)); } else { - return std::unique_ptr(new std_comms( - nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_)); + return std::unique_ptr( + new std_comms(nccl_comm, subcomm_ranks.size(), key, stream_)); } } @@ -465,6 +456,7 @@ class std_comms : public comms_iface { cudaStream_t stream_; int *sendbuff_, *recvbuff_; + rmm::device_uvector status_; int num_ranks_; int rank_; @@ -478,8 +470,6 @@ class std_comms : public comms_iface { mutable std::unordered_map requests_in_flight_; mutable std::unordered_set free_requests_; - - std::shared_ptr device_allocator_; }; } // end namespace comms } // end namespace raft diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp index 4e95c4eef0..17db8e88af 100644 --- a/cpp/include/raft/comms/test.hpp +++ b/cpp/include/raft/comms/test.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -44,8 +43,7 @@ bool test_collective_allreduce(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(1, stream); + rmm::device_scalar temp_d(stream); CUDA_CHECK( cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream)); @@ -76,8 +74,7 @@ bool test_collective_broadcast(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(1, stream); + rmm::device_scalar temp_d(stream); if (communicator.get_rank() == root) CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), @@ -104,8 +101,7 @@ bool test_collective_reduce(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(1, stream); + rmm::device_scalar temp_d(stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); @@ -134,11 +130,8 @@ bool test_collective_allgather(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(1, stream); - - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, - communicator.get_size()); + rmm::device_scalar temp_d(stream); + rmm::device_uvector recv_d(communicator.get_size(), stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); @@ -169,12 +162,9 @@ bool test_collective_gather(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(1, stream); - - raft::mr::device::buffer recv_d( - handle.get_device_allocator(), stream, - communicator.get_rank() == root ? 
communicator.get_size() : 0); + rmm::device_scalar temp_d(stream); + rmm::device_uvector recv_d( + communicator.get_rank() == root ? communicator.get_size() : 0, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream)); @@ -211,12 +201,9 @@ bool test_collective_gatherv(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream); - temp_d.resize(sends.size(), stream); - - raft::mr::device::buffer recv_d( - handle.get_device_allocator(), stream, - communicator.get_rank() == root ? displacements.back() : 0); + rmm::device_uvector temp_d(sends.size(), stream); + rmm::device_uvector recv_d( + communicator.get_rank() == root ? displacements.back() : 0, stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, @@ -256,10 +243,8 @@ bool test_collective_reducescatter(const handle_t &handle, int root) { cudaStream_t stream = handle.get_stream(); - raft::mr::device::buffer temp_d(handle.get_device_allocator(), stream, - sends.size()); - raft::mr::device::buffer recv_d(handle.get_device_allocator(), stream, - 1); + rmm::device_uvector temp_d(sends.size(), stream); + rmm::device_scalar recv_d(stream); CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 86c60addf2..85ca310530 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -17,6 +17,8 @@ #pragma once #include +#include +#include #include @@ -25,6 +27,8 @@ #include #include #include +#include +#include ///@todo: enable once logging has been enabled in raft //#include "logger.hpp" @@ -200,7 +204,8 @@ class grid_1d_block_t { * @param stream cuda stream */ template -void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { +void copy(Type* dst, const Type* src, size_t len, + rmm::cuda_stream_view stream) { CUDA_CHECK( cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); } @@ -214,20 +219,20 @@ void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { /** performs a host to device copy */ template void update_device(Type* d_ptr, const Type* h_ptr, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { copy(d_ptr, h_ptr, len, stream); } /** performs a device to host copy */ template void update_host(Type* h_ptr, const Type* d_ptr, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { copy(h_ptr, d_ptr, len, stream); } template void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, - cudaStream_t stream) { + rmm::cuda_stream_view stream) { CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream)); } @@ -259,11 +264,36 @@ void print_device_vector(const char* variable_name, const T* devMem, } /** @} */ -/** cuda malloc */ +static std::mutex mutex_; +static std::unordered_map allocations; + +template +void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, + bool setZero = false) { + size_t size = len * sizeof(Type); + ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream); + if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream)); + + std::lock_guard _(mutex_); + allocations[ptr] = size; +} + template -void allocate(Type*& ptr, size_t len, bool setZero = false) { - CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * 
len)); - if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len)); +void deallocate(Type*& ptr, rmm::cuda_stream_view stream) { + std::lock_guard _(mutex_); + size_t size = allocations[ptr]; + allocations.erase(ptr); + rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream); +} + +inline void deallocate_all(rmm::cuda_stream_view stream) { + std::lock_guard _(mutex_); + for (auto& alloc : allocations) { + void* ptr = alloc.first; + size_t size = alloc.second; + rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream); + } + allocations.clear(); } /** helper method to get max usable shared mem per block parameter */ diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 02d8fb6d03..65b4f3b830 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -31,7 +31,7 @@ #include #include #include -#include +#include namespace raft { namespace distance { @@ -376,7 +376,7 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m, template void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, Index_ n, Index_ k, - raft::mr::device::buffer &workspace, + rmm::device_uvector &workspace, cudaStream_t stream, bool isRowMajor, Type metric_arg = 2.0f) { auto worksize = @@ -389,8 +389,7 @@ void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m, template void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m, - Index_ n, Index_ k, - raft::mr::device::buffer &workspace, + Index_ n, Index_ k, rmm::device_uvector &workspace, raft::distance::DistanceType metric, cudaStream_t stream, bool isRowMajor = true, Type metric_arg = 2.0f) { switch (metric) { diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index dbe7e83189..c925669530 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -36,9 +36,8 @@ #include #include #include -#include -#include #include +#include #include "cudart_utils.h" namespace raft { @@ -63,10 +62,9 @@ class handle_t { CUDA_CHECK(cudaGetDevice(&cur_dev)); return cur_dev; }()), - streams_(n_streams), - device_allocator_(std::make_shared()), - host_allocator_(std::make_shared()) { + streams_(n_streams) { create_resources(); + thrust_policy_ = std::make_unique(user_stream_); } /** @@ -86,10 +84,9 @@ class handle_t { "ERROR: the main handle must have at least one worker stream\n"); prop_ = other.get_device_properties(); device_prop_initialized_ = true; - device_allocator_ = other.get_device_allocator(); - host_allocator_ = other.get_host_allocator(); create_resources(); set_stream(other.get_internal_stream(stream_id)); + thrust_policy_ = std::make_unique(user_stream_); } /** Destroys all held-up resources */ @@ -103,20 +100,6 @@ class handle_t { return rmm::cuda_stream_view(user_stream_); } - void set_device_allocator(std::shared_ptr allocator) { - device_allocator_ = allocator; - } - std::shared_ptr get_device_allocator() const { - return device_allocator_; - } - - void set_host_allocator(std::shared_ptr allocator) { - host_allocator_ = allocator; - } - std::shared_ptr get_host_allocator() const { - return host_allocator_; - } - cublasHandle_t get_cublas_handle() const { std::lock_guard _(mutex_); if (!cublas_initialized_) { @@ -153,6 +136,8 @@ class handle_t { return cusparse_handle_; } + rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; } + // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { 
return streams_.get_stream(sid).value(); @@ -236,8 +221,7 @@ class handle_t { mutable bool cusolver_sp_initialized_{false}; mutable cusparseHandle_t cusparse_handle_; mutable bool cusparse_initialized_{false}; - std::shared_ptr device_allocator_; - std::shared_ptr host_allocator_; + std::unique_ptr thrust_policy_{nullptr}; cudaStream_t user_stream_{nullptr}; cudaEvent_t event_; mutable cudaDeviceProp prop_; diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh index 0da7da2eb6..b2302836bc 100644 --- a/cpp/include/raft/label/classlabels.cuh +++ b/cpp/include/raft/label/classlabels.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include namespace raft { namespace label { @@ -36,41 +36,39 @@ namespace label { * \tparam value_t numeric type of the arrays with class labels * \param [in] y device array of labels, size [n] * \param [in] n number of labels - * \param [out] y_unique device array of unique labels, unallocated on entry, + * \param [out] unique device array of unique labels, unallocated on entry, * on exit it has size [n_unique] * \param [out] n_unique number of unique labels * \param [in] stream cuda stream - * \param [in] allocator device allocator */ template -void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique, - cudaStream_t stream, - std::shared_ptr allocator) { - raft::mr::device::buffer y2(allocator, stream, n); - raft::mr::device::buffer y3(allocator, stream, n); - raft::mr::device::buffer d_num_selected(allocator, stream, 1); +int getUniquelabels(rmm::device_uvector &unique, value_t *y, size_t n, + cudaStream_t stream) { + rmm::device_scalar d_num_selected(stream); + rmm::device_uvector workspace(n, stream); size_t bytes = 0; size_t bytes2 = 0; // Query how much temporary storage we will need for cub operations // and allocate it - cub::DeviceRadixSort::SortKeys(NULL, bytes, y, y2.data(), n); - cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(), + cub::DeviceRadixSort::SortKeys(NULL, bytes, y, workspace.data(), n); + cub::DeviceSelect::Unique(NULL, bytes2, workspace.data(), workspace.data(), d_num_selected.data(), n); bytes = max(bytes, bytes2); - raft::mr::device::buffer cub_storage(allocator, stream, bytes); + rmm::device_uvector cub_storage(bytes, stream); // Select Unique classes - cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n); - cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(), - d_num_selected.data(), n); - raft::update_host(n_unique, d_num_selected.data(), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, workspace.data(), + n); + cub::DeviceSelect::Unique(cub_storage.data(), bytes, workspace.data(), + workspace.data(), d_num_selected.data(), n); + int n_unique = d_num_selected.value(stream); // Copy unique classes to output - *y_unique = - (value_t *)allocator->allocate(*n_unique * sizeof(value_t), stream); - raft::copy(*y_unique, y3.data(), *n_unique, stream); + unique.resize(n_unique, stream); + raft::copy(unique.data(), workspace.data(), n_unique, stream); + + return n_unique; } /** @@ -147,22 +145,17 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in, */ template void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - Lambda filter_op, - std::shared_ptr allocator, - bool zero_based = false) { + Lambda filter_op, bool zero_based = false) { static const size_t TPB_X = 256; dim3 
blocks(raft::ceildiv(N, TPB_X)); dim3 threads(TPB_X); - Type *map_ids; - int num_clusters; - getUniquelabels(in, N, &map_ids, &num_clusters, stream, allocator); + rmm::device_uvector map_ids(0, stream); + int num_clusters = getUniquelabels(map_ids, in, N, stream); map_label_kernel<<>>( - map_ids, num_clusters, in, out, N, filter_op, zero_based); - - allocator->deallocate(map_ids, num_clusters * sizeof(Type), stream); + map_ids.data(), num_clusters, in, out, N, filter_op, zero_based); } /** @@ -184,11 +177,9 @@ void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, */ template void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream, - std::shared_ptr allocator, bool zero_based = false) { make_monotonic( - out, in, N, stream, [] __device__(Type val) { return false; }, allocator, - zero_based); + out, in, N, stream, [] __device__(Type val) { return false; }, zero_based); } }; // namespace label }; // end namespace raft diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh index 6bc1c08029..f64afb3549 100644 --- a/cpp/include/raft/lap/lap.cuh +++ b/cpp/include/raft/lap/lap.cuh @@ -25,6 +25,7 @@ #pragma once #include +#include #include "d_structs.h" #include "lap_functions.cuh" @@ -44,19 +45,19 @@ class LinearAssignmentProblem { VertexData d_row_data_dev, d_col_data_dev; raft::handle_t const &handle_; - raft::mr::device::buffer row_covers_v; - raft::mr::device::buffer col_covers_v; - raft::mr::device::buffer row_duals_v; - raft::mr::device::buffer col_duals_v; - raft::mr::device::buffer col_slacks_v; - raft::mr::device::buffer row_is_visited_v; - raft::mr::device::buffer col_is_visited_v; - raft::mr::device::buffer row_parents_v; - raft::mr::device::buffer col_parents_v; - raft::mr::device::buffer row_children_v; - raft::mr::device::buffer col_children_v; - raft::mr::device::buffer obj_val_primal_v; - raft::mr::device::buffer obj_val_dual_v; + rmm::device_uvector row_covers_v; + rmm::device_uvector col_covers_v; + rmm::device_uvector row_duals_v; + rmm::device_uvector col_duals_v; + rmm::device_uvector col_slacks_v; + rmm::device_uvector row_is_visited_v; + rmm::device_uvector col_is_visited_v; + rmm::device_uvector row_parents_v; + rmm::device_uvector col_parents_v; + rmm::device_uvector row_children_v; + rmm::device_uvector col_children_v; + rmm::device_uvector obj_val_primal_v; + rmm::device_uvector obj_val_dual_v; public: LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size, @@ -66,19 +67,19 @@ class LinearAssignmentProblem { batchsize_(batchsize), epsilon_(epsilon), d_costs_(nullptr), - row_covers_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_covers_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_duals_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_duals_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_slacks_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_is_visited_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_is_visited_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_parents_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_parents_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - row_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - col_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - obj_val_primal_v(handle_.get_device_allocator(), handle_.get_stream(), 0), - obj_val_dual_v(handle_.get_device_allocator(), 
handle_.get_stream(), 0) {} + row_covers_v(0, handle_.get_stream()), + col_covers_v(0, handle_.get_stream()), + row_duals_v(0, handle_.get_stream()), + col_duals_v(0, handle_.get_stream()), + col_slacks_v(0, handle_.get_stream()), + row_is_visited_v(0, handle_.get_stream()), + col_is_visited_v(0, handle_.get_stream()), + row_parents_v(0, handle_.get_stream()), + col_parents_v(0, handle_.get_stream()), + row_children_v(0, handle_.get_stream()), + col_children_v(0, handle_.get_stream()), + obj_val_primal_v(0, handle_.get_stream()), + obj_val_dual_v(0, handle_.get_stream()) {} // Executes Hungarian algorithm on the input cost matrix. void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment, @@ -152,19 +153,20 @@ class LinearAssignmentProblem { private: // Helper function for initializing global variables and arrays on a single host. void initializeDevice() { - row_covers_v.resize(batchsize_ * size_); - col_covers_v.resize(batchsize_ * size_); - row_duals_v.resize(batchsize_ * size_); - col_duals_v.resize(batchsize_ * size_); - col_slacks_v.resize(batchsize_ * size_); - row_is_visited_v.resize(batchsize_ * size_); - col_is_visited_v.resize(batchsize_ * size_); - row_parents_v.resize(batchsize_ * size_); - col_parents_v.resize(batchsize_ * size_); - row_children_v.resize(batchsize_ * size_); - col_children_v.resize(batchsize_ * size_); - obj_val_primal_v.resize(batchsize_); - obj_val_dual_v.resize(batchsize_); + cudaStream_t stream = handle_.get_stream(); + row_covers_v.resize(batchsize_ * size_, stream); + col_covers_v.resize(batchsize_ * size_, stream); + row_duals_v.resize(batchsize_ * size_, stream); + col_duals_v.resize(batchsize_ * size_, stream); + col_slacks_v.resize(batchsize_ * size_, stream); + row_is_visited_v.resize(batchsize_ * size_, stream); + col_is_visited_v.resize(batchsize_ * size_, stream); + row_parents_v.resize(batchsize_ * size_, stream); + col_parents_v.resize(batchsize_ * size_, stream); + row_children_v.resize(batchsize_ * size_, stream); + col_children_v.resize(batchsize_ * size_, stream); + obj_val_primal_v.resize(batchsize_, stream); + obj_val_dual_v.resize(batchsize_, stream); d_vertices_dev.row_covers = row_covers_v.data(); d_vertices_dev.col_covers = col_covers_v.data(); @@ -231,17 +233,16 @@ class LinearAssignmentProblem { int hungarianStep3() { int next; - raft::mr::device::buffer flag_v(handle_.get_device_allocator(), - handle_.get_stream(), 1); + rmm::device_scalar flag_v(handle_.get_stream()); bool h_flag = false; - raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream()); + flag_v.set_value_async(h_flag, handle_.get_stream()); detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev, d_col_data_dev, flag_v.data(), batchsize_, size_, epsilon_); - raft::update_host(&h_flag, flag_v.data(), 1, handle_.get_stream()); + h_flag = flag_v.value(handle_.get_stream()); next = h_flag ? 
4 : 5; diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh index 7640f3f816..830940f0ec 100644 --- a/cpp/include/raft/lap/lap_functions.cuh +++ b/cpp/include/raft/lap/lap_functions.cuh @@ -29,7 +29,8 @@ #include #include #include -#include +#include +#include #include #include @@ -125,10 +126,8 @@ inline void computeInitialAssignments(raft::handle_t const &handle, std::size_t size = SP * N; - raft::mr::device::buffer row_lock_v(handle.get_device_allocator(), - handle.get_stream(), size); - raft::mr::device::buffer col_lock_v(handle.get_device_allocator(), - handle.get_stream(), size); + rmm::device_uvector row_lock_v(size, handle.get_stream()); + rmm::device_uvector col_lock_v(size, handle.get_stream()); thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1); thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1); @@ -214,25 +213,21 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, weight_t epsilon) { vertex_t M; - raft::mr::device::buffer csr_ptrs_v(handle.get_device_allocator(), - handle.get_stream(), 0); - raft::mr::device::buffer csr_neighbors_v( - handle.get_device_allocator(), handle.get_stream(), 0); + rmm::device_uvector csr_ptrs_v(0, handle.get_stream()); + rmm::device_uvector csr_neighbors_v(0, handle.get_stream()); { dim3 blocks_per_grid; dim3 threads_per_block; int total_blocks = 0; - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); - raft::mr::device::buffer addresses_v( - handle.get_device_allocator(), handle.get_stream(), SP * N); + rmm::device_uvector predicates_v(SP * N, handle.get_stream()); + rmm::device_uvector addresses_v(SP * N, handle.get_stream()); thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false); thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0}); - csr_ptrs_v.resize(SP + 1); + csr_ptrs_v.resize(SP + 1, handle.get_stream()); thrust::fill_n(thrust::device, csr_ptrs_v.data(), (SP + 1), vertex_t{-1}); @@ -251,7 +246,7 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle, addresses_v.end(), addresses_v.begin()); if (M > 0) { - csr_neighbors_v.resize(M); + csr_neighbors_v.resize(M, handle.get_stream()); kernel_rowScatterCSR<<>>( @@ -300,10 +295,8 @@ inline void reversePass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size); - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - handle.get_stream(), size); - raft::mr::device::buffer addresses_v(handle.get_device_allocator(), - handle.get_stream(), size); + rmm::device_uvector predicates_v(size, handle.get_stream()); + rmm::device_uvector addresses_v(size, handle.get_stream()); thrust::fill_n(thrust::device, predicates_v.data(), size, false); thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0}); @@ -329,8 +322,7 @@ inline void reversePass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims( blocks_per_grid_1, threads_per_block_1, total_blocks_1, csr_size); - raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), csr_size); + rmm::device_uvector elements_v(csr_size, handle.get_stream()); kernel_augmentScatter<<>>( @@ -358,10 +350,8 @@ inline void augmentationPass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N); - raft::mr::device::buffer predicates_v(handle.get_device_allocator(), - 
handle.get_stream(), SP * N); - raft::mr::device::buffer addresses_v(handle.get_device_allocator(), - handle.get_stream(), SP * N); + rmm::device_uvector predicates_v(SP * N, handle.get_stream()); + rmm::device_uvector addresses_v(SP * N, handle.get_stream()); thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false); thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0}); @@ -388,8 +378,8 @@ inline void augmentationPass(raft::handle_t const &handle, raft::lap::detail::calculateLinearDims( blocks_per_grid_1, threads_per_block_1, total_blocks_1, row_ids_csr_size); - raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), row_ids_csr_size); + rmm::device_uvector elements_v(row_ids_csr_size, + handle.get_stream()); kernel_augmentScatter<<>>( @@ -418,8 +408,7 @@ inline void dualUpdate(raft::handle_t const &handle, dim3 threads_per_block; int total_blocks; - raft::mr::device::buffer sp_min_v(handle.get_device_allocator(), - handle.get_stream(), 1); + rmm::device_scalar sp_min_v(handle.get_stream()); raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP); diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh index b5a93c4953..d6d064c20e 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.cuh +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -63,11 +63,11 @@ namespace linalg { * @code{.cpp} * // Initialize arrays * int ld_L = n_rows; - * device_buffer L(allocator, stream, ld_L * n_rows); + * rmm::device_uvector L(ld_L * n_rows, stream); * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_L, nullptr, * &n_bytes, CUBLAS_FILL_MODE_LOWER, * stream); - * device_buffer workspace(allocator, stream, n_bytes); + * rmm::device_uvector workspace(n_bytes, stream); * * for (n=1; n<=n_rows; rank++) { * // Calculate a new row/column of matrix A into A_new @@ -87,11 +87,11 @@ namespace linalg { * @code{.cpp} * // Initialize arrays * int ld_U = n_rows; - * device_buffer U(allocator, stream, ld_U * n_rows); + * rmm::device_uvector U(ld_U * n_rows, stream); * MLCommon::LinAlg::choleskyRank1Update(handle, L, n_rows, ld_U, nullptr, * &n_bytes, CUBLAS_FILL_MODE_UPPER, * stream); - * device_buffer workspace(allocator, stream, n_bytes); + * rmm::device_uvector workspace(n_bytes, stream); * * for (n=1; n<=n_rows; n++) { * // Calculate a new row/column of matrix A into array A_new diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh index 6172618380..5b2df3bcb3 100644 --- a/cpp/include/raft/linalg/eig.cuh +++ b/cpp/include/raft/linalg/eig.cuh @@ -22,7 +22,8 @@ #include #include #include -#include +#include +#include namespace raft { namespace linalg { @@ -44,7 +45,6 @@ template void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; @@ -52,8 +52,8 @@ void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows, CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, eig_vals, &lwork)); - raft::mr::device::buffer d_work(allocator, stream, lwork); - raft::mr::device::buffer d_dev_info(allocator, stream, 1); + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar d_dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); @@ -63,9 +63,7 @@ void eigDC(const raft::handle_t &handle,
const math_t *in, int n_rows, d_dev_info.data(), stream)); CUDA_CHECK(cudaGetLastError()); - int dev_info; - raft::update_host(&dev_info, d_dev_info.data(), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + int dev_info = d_dev_info.value(stream); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); @@ -93,7 +91,6 @@ template void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int n_eig_vals, math_t *eig_vectors, math_t *eig_vals, EigVecMemUsage memUsage, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int lwork; @@ -104,9 +101,9 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork)); - raft::mr::device::buffer d_work(allocator, stream, lwork); - raft::mr::device::buffer d_dev_info(allocator, stream, 1); - raft::mr::device::buffer d_eig_vectors(allocator, stream, 0); + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar d_dev_info(stream); + rmm::device_uvector d_eig_vectors(0, stream); if (memUsage == OVERWRITE_INPUT) { CUSOLVER_CHECK(cusolverDnsyevdx( @@ -127,9 +124,7 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, CUDA_CHECK(cudaGetLastError()); - int dev_info; - raft::update_host(&dev_info, d_dev_info.data(), 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + int dev_info = d_dev_info.value(stream); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); @@ -163,7 +158,6 @@ template void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, int n_cols, math_t *eig_vectors, math_t *eig_vals, cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); syevjInfo_t syevj_params = nullptr; @@ -176,8 +170,8 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows, cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, eig_vals, &lwork, syevj_params)); - raft::mr::device::buffer d_work(allocator, stream, lwork); - raft::mr::device::buffer dev_info(allocator, stream, 1); + rmm::device_uvector d_work(lwork, stream); + rmm::device_scalar dev_info(stream); raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream); diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h index cb2e8ed1ab..9944685a1f 100644 --- a/cpp/include/raft/linalg/init.h +++ b/cpp/include/raft/linalg/init.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft { namespace linalg { @@ -40,7 +41,7 @@ void range(T *out, int start, int end, cudaStream_t stream) { thrust::counting_iterator first(start); thrust::counting_iterator last = first + (end - start); thrust::device_ptr ptr(out); - thrust::copy(thrust::cuda::par.on(stream), first, last, ptr); + thrust::copy(rmm::exec_policy(stream), first, last, ptr); } /** diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh index cafa8d54f1..cc912d7d86 100644 --- a/cpp/include/raft/linalg/qr.cuh +++ b/cpp/include/raft/linalg/qr.cuh @@ -19,7 +19,8 @@ #include #include #include -#include +#include +#include namespace raft { 
namespace linalg { @@ -42,7 +43,6 @@ namespace linalg { template void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, int n_rows, int n_cols, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; @@ -50,14 +50,14 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream)); - raft::mr::device::buffer tau(allocator, stream, k); + rmm::device_uvector tau(k, stream); CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream)); - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); int Lwork; CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork)); - raft::mr::device::buffer workspace(allocator, stream, Lwork); + rmm::device_uvector workspace(Lwork, stream); CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); @@ -86,12 +86,11 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q, template void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, int n_rows, int n_cols, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); int m = n_rows, n = n_cols; - raft::mr::device::buffer R_full(allocator, stream, m * n); - raft::mr::device::buffer tau(allocator, stream, min(m, n)); + rmm::device_uvector R_full(m * n, stream); + rmm::device_uvector tau(min(m, n), stream); CUDA_CHECK( cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream)); int R_full_nrows = m, R_full_ncols = n; @@ -99,12 +98,12 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R, cudaMemcpyDeviceToDevice, stream)); int Lwork; - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork)); - raft::mr::device::buffer workspace(allocator, stream, Lwork); + rmm::device_uvector workspace(Lwork, stream); CUSOLVER_CHECK(cusolverDngeqrf( cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, tau.data(), workspace.data(), Lwork, devInfo.data(), stream)); diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh index 7357a68a4c..8b40a80903 100644 --- a/cpp/include/raft/linalg/svd.cuh +++ b/cpp/include/raft/linalg/svd.cuh @@ -23,7 +23,8 @@ #include #include #include -#include +#include +#include #include "eig.cuh" #include "gemm.cuh" #include "transpose.h" @@ -54,8 +55,6 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *sing_vals, T *left_sing_vecs, T *right_sing_vecs, bool trans_right, bool gen_left_vec, bool gen_right_vec, cudaStream_t stream) { - std::shared_ptr allocator = - handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); @@ -71,13 +70,13 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, const int m = n_rows; const int n = n_cols; - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); T *d_rwork = nullptr; int lwork = 0; CUSOLVER_CHECK( cusolverDngesvd_bufferSize(cusolverH, n_rows, n_cols, &lwork)); - raft::mr::device::buffer d_work(allocator, stream, 
lwork); + rmm::device_uvector d_work(lwork, stream); char jobu = 'S'; char jobvt = 'A'; @@ -112,12 +111,11 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols, template void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S, T *U, T *V, bool gen_left_vec, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); cublasHandle_t cublasH = handle.get_cublas_handle(); int len = n_cols * n_cols; - raft::mr::device::buffer in_cross_mult(allocator, stream, len); + rmm::device_uvector in_cross_mult(len, stream); T alpha = T(1); T beta = T(0); @@ -162,7 +160,6 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, math_t *sing_vals, math_t *left_sing_vecs, math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec, math_t tol, int max_sweeps, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle(); gesvdjInfo_t gesvdj_params = NULL; @@ -174,7 +171,7 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, int m = n_rows; int n = n_cols; - raft::mr::device::buffer devInfo(allocator, stream, 1); + rmm::device_scalar devInfo(stream); int lwork = 0; int econ = 1; @@ -183,7 +180,7 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols, cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params)); - raft::mr::device::buffer d_work(allocator, stream, lwork); + rmm::device_uvector d_work(lwork, stream); CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj( cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals, @@ -210,10 +207,8 @@ template void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S, math_t *V, math_t *out, int n_rows, int n_cols, int k, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); - const math_t alpha = 1.0, beta = 0.0; - raft::mr::device::buffer SVT(allocator, stream, k * n_cols); + rmm::device_uvector SVT(k * n_cols, stream); raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream); @@ -239,14 +234,13 @@ template bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, math_t *S_vec, math_t *V, int n_rows, int n_cols, int k, math_t tol, cudaStream_t stream) { - auto allocator = handle.get_device_allocator(); cublasHandle_t cublasH = handle.get_cublas_handle(); int m = n_rows, n = n_cols; // form product matrix - raft::mr::device::buffer P_d(allocator, stream, m * n); - raft::mr::device::buffer S_mat(allocator, stream, k * k); + rmm::device_uvector P_d(m * n, stream); + rmm::device_uvector S_mat(k * k, stream); CUDA_CHECK(cudaMemsetAsync(P_d.data(), 0, sizeof(math_t) * m * n, stream)); CUDA_CHECK(cudaMemsetAsync(S_mat.data(), 0, sizeof(math_t) * k * k, stream)); @@ -262,7 +256,7 @@ bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U, // calculate percent error const math_t alpha = 1.0, beta = -1.0; - raft::mr::device::buffer A_minus_P(allocator, stream, m * n); + rmm::device_uvector A_minus_P(m * n, stream); CUDA_CHECK( cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream)); diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h index d90f6271fa..db1cabd694 100644 --- a/cpp/include/raft/linalg/transpose.h +++ 
b/cpp/include/raft/linalg/transpose.h @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include namespace raft { namespace linalg { @@ -60,7 +60,7 @@ void transpose(math_t *inout, int n, cudaStream_t stream) { auto d_inout = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(int idx) { int s_row = idx % m; int s_col = idx / m; diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh index 0a72117140..41ca85dce0 100644 --- a/cpp/include/raft/matrix/math.cuh +++ b/cpp/include/raft/matrix/math.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include namespace raft { namespace matrix { @@ -285,7 +285,6 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len, * @param src: input matrix * @param dest: output matrix. The result is stored in the dest matrix * @param len: number elements of input matrix - * @param allocator device allocator * @param stream cuda stream */ template @@ -294,10 +293,7 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len, auto d_src = src; auto d_dest = dest; - std::shared_ptr allocator = - handle.get_device_allocator(); - - raft::mr::device::buffer d_sum(allocator, stream, 1); + rmm::device_scalar d_sum(stream); auto *d_sum_ptr = d_sum.data(); auto no_op = [] __device__(math_t in) { return in; }; raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src); diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 5f5755e24e..688b92da09 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -20,13 +20,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include namespace raft { namespace matrix { @@ -64,7 +64,7 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t size = n_rows_indices * n_cols; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { idx_t row = idx % n_rows_indices; idx_t col = idx / n_rows_indices; @@ -108,7 +108,7 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows, auto d_q_trunc = out; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size, + thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) { idx_t row = idx % m; idx_t col = idx / m; @@ -133,8 +133,8 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto d_q_reversed = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { + thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), + [=] __device__(idx_t idx) { idx_t dest_row = idx % m; idx_t dest_col = idx / m; idx_t src_row = dest_row; @@ -161,8 +161,8 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto d_q_reversed = inout; auto counting = thrust::make_counting_iterator(0); - thrust::for_each(thrust::cuda::par.on(stream), counting, - counting + (size / 2), [=] __device__(idx_t idx) { + 
thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2), + [=] __device__(idx_t idx) { idx_t dest_row = idx % m; idx_t dest_col = idx / m; idx_t src_row = (m - dest_row) - 1; diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh index 56710ea81f..3d2e44e49b 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/rng.cuh @@ -24,9 +24,8 @@ #include #include #include -#include -#include #include +#include #include #include "rng_impl.cuh" @@ -498,7 +497,6 @@ class Rng { * sampling is desired * @param sampledLen output sampled array length * @param len input array length - * @param allocator device allocator for allocating any workspace required * @param stream cuda stream */ template @@ -509,13 +507,10 @@ class Rng { ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'."); - std::shared_ptr allocator = - handle.get_device_allocator(); - - raft::mr::device::buffer expWts(allocator, stream, len); - raft::mr::device::buffer sortedWts(allocator, stream, len); - raft::mr::device::buffer inIdx(allocator, stream, len); - raft::mr::device::buffer outIdxBuff(allocator, stream, len); + rmm::device_uvector expWts(len, stream); + rmm::device_uvector sortedWts(len, stream); + rmm::device_uvector inIdx(len, stream); + rmm::device_uvector outIdxBuff(len, stream); auto *inIdxPtr = inIdx.data(); // generate modified weights custom_distribution( @@ -533,7 +528,7 @@ class Rng { ///@todo: use a more efficient partitioning scheme instead of full sort // sort the array and pick the top sampledLen items IdxT *outIdxPtr = outIdxBuff.data(); - raft::mr::device::buffer workspace(allocator, stream); + rmm::device_uvector workspace(0, stream); sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream); if (outIdx != nullptr) { diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh index a034bdbda8..79b18ebd0a 100644 --- a/cpp/include/raft/sparse/convert/csr.cuh +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -22,8 +22,7 @@ #include #include #include -#include -#include +#include #include #include @@ -49,16 +48,15 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows, int *dst_offsets, int *dstCols, value_t *dstVals) { auto stream = handle.get_stream(); auto cusparseHandle = handle.get_cusparse_handle(); - auto d_alloc = handle.get_device_allocator(); - raft::mr::device::buffer dstRows(d_alloc, stream, nnz); + rmm::device_uvector dstRows(nnz, stream); CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream)); auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt( cusparseHandle, m, m, nnz, srcRows, srcCols, stream); - raft::mr::device::buffer pBuffer(d_alloc, stream, buffSize); - raft::mr::device::buffer P(d_alloc, stream, nnz); + rmm::device_uvector pBuffer(buffSize, stream); + rmm::device_uvector P(nnz, stream); CUSPARSE_CHECK( cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data())); raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(), @@ -147,14 +145,12 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz, * @param nnz: size of COO rows array * @param row_ind: output row indices array * @param m: number of rows in dense matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ 
template void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, - std::shared_ptr d_alloc, cudaStream_t stream) { - raft::mr::device::buffer row_counts(d_alloc, stream, m); + rmm::device_uvector row_counts(m, stream); CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream)); @@ -164,7 +160,7 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); } @@ -173,15 +169,11 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m, * * @param coo: Input COO matrix * @param row_ind: output row indices array - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, - std::shared_ptr d_alloc, - cudaStream_t stream) { - sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, - stream); +void sorted_coo_to_csr(COO *coo, int *row_ind, cudaStream_t stream) { + sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream); } }; // end NAMESPACE convert diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh index 73120fea8c..fa21614f8f 100644 --- a/cpp/include/raft/sparse/coo.cuh +++ b/cpp/include/raft/sparse/coo.cuh @@ -17,13 +17,11 @@ #include #include #include -#include -#include +#include #include #include -#include #include #include @@ -58,9 +56,9 @@ namespace sparse { template class COO { protected: - raft::mr::device::buffer rows_arr; - raft::mr::device::buffer cols_arr; - raft::mr::device::buffer vals_arr; + rmm::device_uvector rows_arr; + rmm::device_uvector cols_arr; + rmm::device_uvector vals_arr; public: Index_Type nnz; @@ -68,13 +66,12 @@ class COO { Index_Type n_cols; /** - * @param d_alloc: the device allocator to use for the underlying buffers * @param stream: CUDA stream to use */ - COO(std::shared_ptr d_alloc, cudaStream_t stream) - : rows_arr(d_alloc, stream, 0), - cols_arr(d_alloc, stream, 0), - vals_arr(d_alloc, stream, 0), + COO(cudaStream_t stream) + : rows_arr(0, stream), + cols_arr(0, stream), + vals_arr(0, stream), nnz(0), n_rows(0), n_cols(0) {} @@ -87,10 +84,9 @@ class COO { * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix */ - COO(raft::mr::device::buffer &rows, - raft::mr::device::buffer &cols, - raft::mr::device::buffer &vals, Index_Type nnz, Index_Type n_rows = 0, - Index_Type n_cols = 0) + COO(rmm::device_uvector &rows, + rmm::device_uvector &cols, rmm::device_uvector &vals, + Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0) : rows_arr(rows), cols_arr(cols), vals_arr(vals), @@ -99,19 +95,17 @@ class COO { n_cols(n_cols) {} /** - * @param d_alloc: the device allocator use * @param stream: CUDA stream to use * @param nnz: size of the rows/cols/vals arrays * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix * @param init: initialize arrays with zeros */ - COO(std::shared_ptr d_alloc, cudaStream_t stream, - Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0, - bool init = true) - : rows_arr(d_alloc, stream, nnz), - cols_arr(d_alloc, stream, nnz), - vals_arr(d_alloc, stream, nnz), + COO(cudaStream_t stream, Index_Type nnz, Index_Type n_rows = 0, + 
Index_Type n_cols = 0, bool init = true) + : rows_arr(nnz, stream), + cols_arr(nnz, stream), + vals_arr(nnz, stream), nnz(nnz), n_rows(n_rows), n_cols(n_cols) { diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh index bc4a68d296..041aedf41c 100644 --- a/cpp/include/raft/sparse/csr.cuh +++ b/cpp/include/raft/sparse/csr.cuh @@ -20,8 +20,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -208,7 +208,6 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, * @param row_ind_ptr the row index pointer of the CSR array * @param nnz the size of row_ind_ptr array * @param N number of vertices - * @param d_alloc: deviceAllocator to use for temp memory * @param stream the cuda stream to use * @param filter_op an optional filtering function to determine which points * should get considered for labeling. It gets global indexes (not batch-wide!) @@ -216,11 +215,8 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind, template bool> void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, - std::shared_ptr d_alloc, - cudaStream_t stream, Lambda filter_op) { - raft::mr::device::buffer m(d_alloc, stream, 1); - + Index_ nnz, Index_ N, cudaStream_t stream, Lambda filter_op) { + rmm::device_scalar m(stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, filter_op); @@ -245,15 +241,12 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, * @param row_ind_ptr the row index pointer of the CSR array * @param nnz the size of row_ind_ptr array * @param N number of vertices - * @param d_alloc: deviceAllocator to use for temp memory * @param stream the cuda stream to use */ template void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr, - Index_ nnz, Index_ N, - std::shared_ptr d_alloc, - cudaStream_t stream) { - raft::mr::device::buffer m(d_alloc, stream, 1); + Index_ nnz, Index_ N, cudaStream_t stream) { + rmm::device_scalar m(stream); WeakCCState state(m.data()); weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; }); diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh index f3109556b7..6885c250c0 100644 --- a/cpp/include/raft/sparse/distance/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/bin_distance.cuh @@ -21,13 +21,11 @@ #include #include #include -#include - -#include - #include #include +#include #include +#include #include @@ -84,7 +82,6 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows, const value_t *Q_data, value_idx Q_nnz, const value_idx *R_coo_rows, const value_t *R_data, value_idx R_nnz, value_idx m, value_idx n, - std::shared_ptr alloc, cudaStream_t stream, expansion_f expansion_func) { rmm::device_uvector Q_norms(m, stream); rmm::device_uvector R_norms(n, stream); @@ -130,7 +127,7 @@ class jaccard_expanded_distances_t : public distances_t { compute_bin_distance( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t q_r_union = q_norm + r_norm; value_t denom = q_r_union - dot; @@ -179,7 +176,7 @@ class dice_expanded_distances_t : public distances_t { compute_bin_distance( 
out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t q_r_union = q_norm + r_norm; value_t dice = (2 * dot) / q_r_union; diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh index 3a78f9ada0..24be171900 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include "../csr.cuh" @@ -35,8 +34,6 @@ #include -#include - namespace raft { namespace sparse { namespace distance { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh index 5ace978a23..3b57225350 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh @@ -22,8 +22,6 @@ #include "coo_mask_row_iterators.cuh" #include -#include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh index 44c3833f96..74eb37bc2b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh @@ -20,7 +20,6 @@ #include "../utils.cuh" #include -#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh index 1295d24103..a95c6ff85b 100644 --- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh +++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh @@ -55,7 +55,7 @@ class hash_strategy : public coo_spmv_strategy { rmm::device_uvector &mask_indptr, std::tuple &n_rows_divided, cudaStream_t stream) { - auto policy = rmm::exec_policy(stream); + auto policy = this->config.handle.get_thrust_policy(); auto less = thrust::copy_if( policy, thrust::make_counting_iterator(value_idx(0)), diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh index a1974b3666..03df396b2e 100644 --- a/cpp/include/raft/sparse/distance/distance.cuh +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh index 882ccba027..b1e2756671 100644 --- a/cpp/include/raft/sparse/distance/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/ip_distance.cuh @@ -22,8 +22,6 @@ #include #include -#include - #include #include #include @@ -31,6 +29,7 @@ #include #include #include +#include #include diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh index 8886d4c9df..6ccfd4adcb 100644 --- a/cpp/include/raft/sparse/distance/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/l2_distance.cuh @@ -21,17 +21,13 @@ #include #include #include +#include +#include #include #include -#include - -#include - -#include #include - 
-#include #include +#include #include @@ -127,9 +123,8 @@ template alloc, - cudaStream_t stream, expansion_f expansion_func) { + value_idx R_nnz, value_idx m, value_idx n, cudaStream_t stream, + expansion_f expansion_func) { rmm::device_uvector Q_sq_norms(m, stream); rmm::device_uvector R_sq_norms(n, stream); CUDA_CHECK( @@ -161,7 +156,6 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows, const value_t *Q_data, value_idx Q_nnz, const value_idx *R_coo_rows, const value_t *R_data, value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols, - std::shared_ptr alloc, cudaStream_t stream) { // sum_sq for std dev rmm::device_uvector Q_sq_norms(m, stream); @@ -221,7 +215,7 @@ class l2_expanded_distances_t : public distances_t { compute_l2( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { return -2 * dot + q_norm + r_norm; }); @@ -283,7 +277,6 @@ class correlation_expanded_distances_t : public distances_t { compute_corr(out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, config_->b_ncols, - config_->handle.get_device_allocator(), config_->handle.get_stream()); } @@ -322,7 +315,7 @@ class cosine_expanded_distances_t : public distances_t { compute_l2( out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz, b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows, - config_->handle.get_device_allocator(), config_->handle.get_stream(), + config_->handle.get_stream(), [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) { value_t norms = sqrt(q_norm) * sqrt(r_norm); // deal with potential for 0 in denominator by forcing 0/1 instead diff --git a/cpp/include/raft/sparse/distance/utils.cuh b/cpp/include/raft/sparse/distance/utils.cuh index 6b6d77a2d5..3bee1bc87d 100644 --- a/cpp/include/raft/sparse/distance/utils.cuh +++ b/cpp/include/raft/sparse/distance/utils.cuh @@ -21,8 +21,6 @@ #include -#include - namespace raft { namespace sparse { namespace distance { diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 3cffa1c28a..4ef2ac43e2 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -100,9 +99,8 @@ class UnionFind { template void build_dendrogram_host(const handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *data, - std::size_t nnz, value_idx *children, - value_t *out_delta, value_idx *out_size) { - auto d_alloc = handle.get_device_allocator(); + size_t nnz, value_idx *children, value_t *out_delta, + value_idx *out_size) { auto stream = handle.get_stream(); value_idx n_edges = nnz; @@ -225,11 +223,10 @@ struct init_label_roots { */ template void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels, - const value_idx *children, - std::size_t n_clusters, std::size_t n_leaves) { - auto d_alloc = handle.get_device_allocator(); + const value_idx *children, size_t n_clusters, + size_t n_leaves) { auto stream = handle.get_stream(); - auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = 
handle.get_thrust_policy(); // Handle special case where n_clusters == 1 if (n_clusters == 1) { diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index 7cf959dda6..31e4a0f263 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -60,12 +59,11 @@ struct distance_graph_impl &indptr, rmm::device_uvector &indices, rmm::device_uvector &data, int c) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // Need to symmetrize knn into undirected graph - raft::sparse::COO knn_graph_coo(d_alloc, stream); + raft::sparse::COO knn_graph_coo(stream); raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); @@ -78,7 +76,7 @@ struct distance_graph_impl &tup) { bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); @@ -86,9 +84,8 @@ struct distance_graph_impl(tup)); }); - raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), - knn_graph_coo.nnz, indptr.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream); // TODO: Wouldn't need to copy here if we could compute knn // graph directly on the device uvectors diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 765a5ad77f..6ef6f9879b 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -25,12 +25,9 @@ #include #include -#include - #include #include #include -#include namespace raft { namespace hierarchy { @@ -80,18 +77,16 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X, red_op reduction_op, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - raft::sparse::COO connected_edges(d_alloc, stream); + raft::sparse::COO connected_edges(stream); raft::linkage::connect_components( handle, connected_edges, X, color, m, n, reduction_op); rmm::device_uvector indptr2(m + 1, stream); - raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(), - connected_edges.nnz, indptr2.data(), - m + 1, d_alloc, stream); + raft::sparse::convert::sorted_coo_to_csr( + connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, stream); // On the second call, we hand the MST the original colors // and the new set of edges and let it restart the optimization process @@ -136,7 +131,6 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded, int max_iter = 10) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); // We want to have MST initialize colors on first call. 
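// [Editorial sketch, not part of this patch] A minimal illustration of the calling
// convention the hunks in this file migrate to: temporary device memory is owned by
// rmm::device_uvector and the sparse helpers take only a stream, with no device
// allocator argument. The value type and the variables X, m, n, metric and c below
// are assumptions for illustration only.
raft::sparse::COO<value_t> knn_graph_coo(stream);
raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c);
rmm::device_uvector<int> indptr(m + 1, stream);
raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream);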
@@ -145,7 +139,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, true); int iters = 1; - int n_components = linkage::get_n_components(color, m, d_alloc, stream); + int n_components = linkage::get_n_components(color, m, stream); while (n_components > 1 && iters < max_iter) { connect_knn_graph(handle, X, mst_coo, m, n, color, @@ -153,7 +147,7 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X, iters++; - n_components = linkage::get_n_components(color, m, d_alloc, stream); + n_components = linkage::get_n_components(color, m, stream); } /** diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 01a033945c..06fffb8aed 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -58,7 +58,6 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m, "n_clusters must be less than or equal to the number of data points"); auto stream = handle.get_stream(); - auto d_alloc = handle.get_device_allocator(); rmm::device_uvector indptr(EMPTY, stream); rmm::device_uvector indices(EMPTY, stream); diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh index 47b1ba6e41..7ed627b9e2 100644 --- a/cpp/include/raft/sparse/linalg/add.cuh +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -156,19 +156,17 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr, * @param nnz2: size of right hand index_ptr and val arrays * @param m: size of output array (number of rows in final matrix) * @param out_ind: output row_ind array - * @param d_alloc: device allocator to use for temp memory * @param stream: cuda stream to use */ template size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, int nnz1, const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m, int *out_ind, - std::shared_ptr d_alloc, cudaStream_t stream) { dim3 grid(raft::ceildiv(m, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); - raft::mr::device::buffer row_counts(d_alloc, stream, m + 1); + rmm::device_uvector row_counts(m + 1, stream); CUDA_CHECK( cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream)); @@ -184,7 +182,7 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val, thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts.data()); thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, + exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d); return cnnz; diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh index 15302f3b74..ce0c4bbe6f 100644 --- a/cpp/include/raft/sparse/linalg/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -18,9 +18,8 @@ #include #include -#include -#include #include +#include #include #include @@ -35,16 +34,15 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, int nnz, int n, int n_components, T *out, unsigned long long seed = 1234567) { auto stream = handle.get_stream(); - auto d_alloc = handle.get_device_allocator(); - raft::mr::device::buffer src_offsets(d_alloc, stream, n + 1); - raft::mr::device::buffer dst_cols(d_alloc, stream, nnz); - raft::mr::device::buffer 
dst_vals(d_alloc, stream, nnz); + rmm::device_uvector src_offsets(n + 1, stream); + rmm::device_uvector dst_cols(nnz, stream); + rmm::device_uvector dst_vals(nnz, stream); convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); - raft::mr::device::buffer eigVals(d_alloc, stream, n_components + 1); - raft::mr::device::buffer eigVecs(d_alloc, stream, n * (n_components + 1)); - raft::mr::device::buffer labels(d_alloc, stream, n); + rmm::device_uvector eigVals(n_components + 1, stream); + rmm::device_uvector eigVecs(n * (n_components + 1), stream); + rmm::device_uvector labels(n, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -65,8 +63,6 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, index_type maxiter = 4000; //default reset value (when set to 0); value_type tol = 0.01; index_type restart_iter = 15 + neigvs; //what cugraph is using - auto t_exe_p = thrust::cuda::par.on(stream); - using thrust_exe_policy_t = decltype(t_exe_p); raft::eigen_solver_config_t cfg{neigvs, maxiter, restart_iter, tol}; @@ -85,15 +81,14 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals, using value_type_t = value_type; std::pair solve( - handle_t const &handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, size_type_t dim, + handle_t const &handle, size_type_t n_obs_vecs, size_type_t dim, value_type_t const *__restrict__ obs, index_type_t *__restrict__ codes) const { return std::make_pair(0, 0); } }; - raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver, + raft::spectral::partition(handle, r_csr_m, eig_solver, no_op_cluster_solver_t{}, labels.data(), eigVals.data(), eigVecs.data()); diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh index 5c2c78f0c3..a6e1027288 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -31,8 +31,6 @@ #include #include -#include -#include #include #include @@ -122,22 +120,20 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols, * @param in: Input COO matrix * @param out: Output symmetrized COO matrix * @param reduction_op: a custom reduction function - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template void coo_symmetrize(COO *in, COO *out, Lambda reduction_op, // two-argument reducer - std::shared_ptr d_alloc, cudaStream_t stream) { dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); ASSERT(!out->validate_mem(), "Expecting unallocated COO for output"); - raft::mr::device::buffer in_row_ind(d_alloc, stream, in->n_rows); + rmm::device_uvector in_row_ind(in->n_rows, stream); - convert::sorted_coo_to_csr(in, in_row_ind.data(), d_alloc, stream); + convert::sorted_coo_to_csr(in, in_row_ind.data(), stream); out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream); @@ -250,14 +246,14 @@ __global__ static void symmetric_sum(value_idx *restrict edges, * @param k: Number of n_neighbors * @param out: Output COO Matrix class * @param stream: Input cuda stream - * @param d_alloc device allocator for temporary buffers */ template -void from_knn_symmetrize_matrix( - const value_idx *restrict knn_indices, const value_t *restrict knn_dists, - const value_idx n, const int k, COO *out, - cudaStream_t stream, std::shared_ptr d_alloc) { +void 
from_knn_symmetrize_matrix(const value_idx *restrict knn_indices, + const value_t *restrict knn_dists, + const value_idx n, const int k, + COO *out, + cudaStream_t stream) { // (1) Find how much space needed in each row // We look through all datapoints and increment the count for each row. const dim3 threadsPerBlock(TPB_X, TPB_Y); @@ -265,11 +261,11 @@ void from_knn_symmetrize_matrix( raft::ceildiv(k, TPB_Y)); // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4) - raft::mr::device::buffer row_sizes(d_alloc, stream, n); + rmm::device_uvector row_sizes(n, stream); CUDA_CHECK( cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream)); - raft::mr::device::buffer row_sizes2(d_alloc, stream, n); + rmm::device_uvector row_sizes2(n, stream); CUDA_CHECK( cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream)); @@ -298,8 +294,8 @@ void from_knn_symmetrize_matrix( thrust::device_pointer_cast(row_sizes.data()); // Rolling cumulative sum - thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, - __row_sizes + n, __edges); + thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n, + __edges); // (5) Perform final data + data.T operation in tandem with memcpying symmetric_sum<<>>( @@ -314,7 +310,6 @@ template void symmetrize(const raft::handle_t &handle, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t m, size_t n, size_t nnz, raft::sparse::COO &out) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); // copy rows to cols and cols to rows @@ -333,7 +328,7 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows, // sort COO raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2, symm_rows.data(), symm_cols.data(), - symm_vals.data(), d_alloc, stream); + symm_vals.data(), stream); raft::sparse::op::max_duplicates(handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h index 6afe4ca8f6..7ad4b93ec0 100644 --- a/cpp/include/raft/sparse/linalg/transpose.h +++ b/cpp/include/raft/sparse/linalg/transpose.h @@ -21,8 +21,7 @@ #include #include #include -#include -#include +#include #include #include @@ -53,7 +52,6 @@ namespace linalg { * @param[in] csr_nrows : Number of rows in CSR * @param[in] csr_ncols : Number of columns in CSR * @param[in] nnz : Number of nonzeros of CSR - * @param[in] allocator : Allocator for intermediate memory * @param[in] stream : Cuda stream for ordering events */ template @@ -61,9 +59,7 @@ void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, const value_idx *csr_indices, const value_t *csr_data, value_idx *csc_indptr, value_idx *csc_indices, value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols, - value_idx nnz, - std::shared_ptr allocator, - cudaStream_t stream) { + value_idx nnz, cudaStream_t stream) { size_t convert_csc_workspace_size = 0; CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize( @@ -72,8 +68,8 @@ void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &convert_csc_workspace_size, stream)); - raft::mr::device::buffer convert_csc_workspace( - allocator, stream, convert_csc_workspace_size); + rmm::device_uvector convert_csc_workspace(convert_csc_workspace_size, + stream); CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc( handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, 
csr_indices, diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh index c5ba4fcb4f..33b980afcd 100644 --- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh +++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh @@ -23,11 +23,10 @@ #include "utils.cuh" #include -#include -#include +#include +#include #include -#include #include #include #include @@ -35,11 +34,6 @@ #include #include -#include - -#include -#include - namespace raft { namespace mst { typedef std::chrono::high_resolution_clock Clock; @@ -65,20 +59,20 @@ MST_solver::MST_solver( offsets(offsets_), indices(indices_), weights(weights_), - altered_weights(e_), + altered_weights(e_, stream_), v(v_), e(e_), color_index(color_), - color(v_), - next_color(v_), - min_edge_color(v_), - new_mst_edge(v_), - mst_edge(e_, false), - temp_src(2 * v_), - temp_dst(2 * v_), - temp_weights(2 * v_), - mst_edge_count(1, 0), - prev_mst_edge_count(1, 0), + color(v_, stream_), + next_color(v_, stream_), + min_edge_color(v_, stream_), + new_mst_edge(v_, stream_), + mst_edge(e_, stream_), + temp_src(2 * v_, stream_), + temp_dst(2 * v_, stream_), + temp_weights(2 * v_, stream_), + mst_edge_count(1, stream_), + prev_mst_edge_count(1, stream_), stream(stream_), symmetrize_output(symmetrize_output_), initialize_colors(initialize_colors_), @@ -87,13 +81,18 @@ MST_solver::MST_solver( max_threads = handle_.get_device_properties().maxThreadsPerBlock; sm_count = handle_.get_device_properties().multiProcessorCount; + mst_edge_count.set_value_to_zero_async(stream); + prev_mst_edge_count.set_value_to_zero_async(stream); + CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), + stream)); + //Initially, color holds the vertex id as color - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); if (initialize_colors_) { thrust::sequence(policy, color.begin(), color.end(), 0); thrust::sequence(policy, color_index, color_index + v, 0); } else { - raft::copy(color.data().get(), color_index, v, stream); + raft::copy(color.data(), color_index, v, stream); } thrust::sequence(policy, next_color.begin(), next_color.end(), 0); } @@ -160,12 +159,12 @@ MST_solver::solve() { timer3 += duration_us(stop - start); #endif - auto curr_mst_edge_count = mst_edge_count[0]; + auto curr_mst_edge_count = mst_edge_count.value(stream); RAFT_EXPECTS(curr_mst_edge_count <= max_mst_edges, "Number of edges found by MST is invalid. This may be due to " "loss in precision. 
Try increasing precision of weights."); - if (curr_mst_edge_count == prev_mst_edge_count[0]) { + if (curr_mst_edge_count == prev_mst_edge_count.value(stream)) { #ifdef MST_TIME std::cout << "Iterations: " << i << std::endl; std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 @@ -196,12 +195,11 @@ MST_solver::solve() { #endif // copy this iteration's results and store - prev_mst_edge_count = mst_edge_count; + prev_mst_edge_count.set_value_async(curr_mst_edge_count, stream); } // result packaging - thrust::host_vector host_mst_edge_count = mst_edge_count; - mst_result.n_edges = host_mst_edge_count[0]; + mst_result.n_edges = mst_edge_count.value(stream); mst_result.src.resize(mst_result.n_edges, stream); mst_result.dst.resize(mst_result.n_edges, stream); mst_result.weights.resize(mst_result.n_edges, stream); @@ -227,8 +225,8 @@ template alteration_t MST_solver::alteration_max() { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); - rmm::device_vector tmp(e); + auto policy = handle.get_thrust_policy(); + rmm::device_uvector tmp(e, stream); thrust::device_ptr weights_ptr(weights); thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin()); //sort tmp weights @@ -242,7 +240,7 @@ MST_solver::alteration_max() { thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1)); auto end = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end)); - auto init = tmp[1] - tmp[0]; + auto init = tmp.element(1, stream) - tmp.element(0, stream); auto max = thrust::transform_reduce(policy, begin, end, alteration_functor(), init, thrust::minimum()); @@ -261,7 +259,7 @@ void MST_solver::alteration() { alteration_t max = alteration_max(); // pool of rand values - rmm::device_vector rand_values(v); + rmm::device_uvector rand_values(v, stream); // Random number generator curandGenerator_t randGen; @@ -269,8 +267,7 @@ void MST_solver::alteration() { curandSetPseudoRandomGeneratorSeed(randGen, 1234567); // Initialize rand values - auto curand_status = - curand_generate_uniformX(randGen, rand_values.data().get(), v); + auto curand_status = curand_generate_uniformX(randGen, rand_values.data(), v); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed"); curand_status = curandDestroyGenerator(randGen); RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, @@ -278,8 +275,8 @@ void MST_solver::alteration() { //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu detail::alteration_kernel<<>>( - v, e, offsets, indices, weights, max, rand_values.data().get(), - altered_weights.data().get()); + v, e, offsets, indices, weights, max, rand_values.data(), + altered_weights.data()); } // updates colors of vertices by propagating the lower color to the higher @@ -288,23 +285,24 @@ template ::label_prop( vertex_t* mst_src, vertex_t* mst_dst) { // update the colors of both ends its until there is no change in colors - thrust::host_vector curr_mst_edge_count = mst_edge_count; + edge_t curr_mst_edge_count = mst_edge_count.value(stream); auto min_pair_nthreads = std::min(v, (vertex_t)max_threads); auto min_pair_nblocks = std::min( (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks); - rmm::device_vector done(1, false); - - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - vertex_t* color_ptr = color.data().get(); - vertex_t* next_color_ptr = next_color.data().get(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + vertex_t* color_ptr = color.data(); + vertex_t* next_color_ptr = next_color.data(); 
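// [editor's note] Illustrative sketch only, not part of the patch above/below. The
// surrounding hunks replace one-element rmm::device_vector counters (previously read
// back on the host via operator[] or a thrust::host_vector copy) with
// rmm::device_scalar, whose accessors are explicitly stream-ordered. The kernel and
// function names below are hypothetical; only the rmm::device_scalar calls shown here
// are taken from the patch itself.

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

__global__ void count_kernel(int* count) { atomicAdd(count, 1); }

void device_scalar_pattern(rmm::cuda_stream_view stream) {
  rmm::device_scalar<int> edge_count(stream);     // device-side counter, uninitialized
  edge_count.set_value_to_zero_async(stream);     // async memset, ordered on `stream`

  // kernels update the counter directly through its device pointer
  count_kernel<<<1, 32, 0, stream.value()>>>(edge_count.data());

  int host_count = edge_count.value(stream);      // syncs `stream`, copies value to host
  edge_count.set_value_async(host_count, stream); // writes a host lvalue back, async
}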
- bool* done_ptr = done.data().get(); + rmm::device_scalar done(stream); + done.set_value_to_zero_async(stream); + bool* done_ptr = done.data(); + const bool true_val = true; auto i = 0; - while (!done[0]) { - done[0] = true; + while (!done.value(stream)) { + done.set_value_async(true_val, stream); detail::min_pair_colors<<>>( v, indices, new_mst_edge_ptr, color_ptr, color_index, next_color_ptr); @@ -327,7 +325,7 @@ template void MST_solver::min_edge_per_vertex() { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits::max()); thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(), @@ -335,11 +333,11 @@ void MST_solver>>( offsets, indices, altered_weights_ptr, color_ptr, color_index, @@ -354,18 +352,18 @@ void MST_solver::max()); - vertex_t* color_ptr = color.data().get(); - edge_t* new_mst_edge_ptr = new_mst_edge.data().get(); - bool* mst_edge_ptr = mst_edge.data().get(); - alteration_t* min_edge_color_ptr = min_edge_color.data().get(); - alteration_t* altered_weights_ptr = altered_weights.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); - vertex_t* temp_dst_ptr = temp_dst.data().get(); - weight_t* temp_weights_ptr = temp_weights.data().get(); + vertex_t* color_ptr = color.data(); + edge_t* new_mst_edge_ptr = new_mst_edge.data(); + bool* mst_edge_ptr = mst_edge.data(); + alteration_t* min_edge_color_ptr = min_edge_color.data(); + alteration_t* altered_weights_ptr = altered_weights.data(); + vertex_t* temp_src_ptr = temp_src.data(); + vertex_t* temp_dst_ptr = temp_dst.data(); + weight_t* temp_weights_ptr = temp_weights.data(); detail::min_edge_per_supervertex<<>>( color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights, @@ -390,8 +388,8 @@ void MST_solver::check_termination() { std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks); // count number of new mst edges - edge_t* mst_edge_count_ptr = mst_edge_count.data().get(); - vertex_t* temp_src_ptr = temp_src.data().get(); + edge_t* mst_edge_count_ptr = mst_edge_count.data(); + vertex_t* temp_src_ptr = temp_src.data(); detail::kernel_count_new_mst_edges<<>>( temp_src_ptr, mst_edge_count_ptr, 2 * v); @@ -411,9 +409,9 @@ template void MST_solver::append_src_dst_pair( vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) { - auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto policy = handle.get_thrust_policy(); - auto curr_mst_edge_count = prev_mst_edge_count[0]; + edge_t curr_mst_edge_count = prev_mst_edge_count.value(stream); // iterator to end of mst edges added to final output in previous iteration auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple( diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh index 8f755de459..4d5ca6ebe1 100644 --- a/cpp/include/raft/sparse/mst/detail/utils.cuh +++ b/cpp/include/raft/sparse/mst/detail/utils.cuh @@ -18,7 +18,7 @@ #pragma once #include -#include +#include #define MST_TIME namespace raft { @@ -32,7 +32,7 @@ __device__ idx_t get_1D_idx() { // somewhat smart vector print template -void printv(rmm::device_vector& vec, const std::string& name = "", +void printv(rmm::device_uvector& vec, const std::string& name = "", const size_t displ = 5) { #ifdef MST_TIME std::cout.precision(15); diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh index 833882ea0d..44b34ee5c7 100644 --- 
a/cpp/include/raft/sparse/mst/mst_solver.cuh +++ b/cpp/include/raft/sparse/mst/mst_solver.cuh @@ -18,8 +18,8 @@ #pragma once #include +#include #include -#include namespace raft { @@ -68,24 +68,24 @@ class MST_solver { vertex_t sm_count; vertex_t* color_index; // represent each supervertex as a color - rmm::device_vector + rmm::device_uvector min_edge_color; // minimum incident edge weight per color - rmm::device_vector new_mst_edge; // new minimum edge per vertex - rmm::device_vector + rmm::device_uvector new_mst_edge; // new minimum edge per vertex + rmm::device_uvector altered_weights; // weights to be used for mst - rmm::device_vector + rmm::device_scalar mst_edge_count; // total number of edges added after every iteration - rmm::device_vector + rmm::device_scalar prev_mst_edge_count; // total number of edges up to the previous iteration - rmm::device_vector + rmm::device_uvector mst_edge; // mst output - true if the edge belongs in mst - rmm::device_vector next_color; // next iteration color - rmm::device_vector color; // index of color that vertex points to + rmm::device_uvector next_color; // next iteration color + rmm::device_uvector color; // index of color that vertex points to // new src-dst pairs found per iteration - rmm::device_vector temp_src; - rmm::device_vector temp_dst; - rmm::device_vector temp_weights; + rmm::device_uvector temp_src; + rmm::device_uvector temp_dst; + rmm::device_uvector temp_weights; void label_prop(vertex_t* mst_src, vertex_t* mst_dst); void min_edge_per_vertex(); diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh index 562d506cfe..492058f85f 100644 --- a/cpp/include/raft/sparse/op/filter.cuh +++ b/cpp/include/raft/sparse/op/filter.cuh @@ -21,8 +21,8 @@ #include #include #include -#include -#include +#include +#include #include #include @@ -84,11 +84,9 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols, template void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, int *crows, int *ccols, T *cvals, int *cnnz, - int *cur_cnnz, T scalar, int n, - std::shared_ptr d_alloc, - cudaStream_t stream) { - raft::mr::device::buffer ex_scan(d_alloc, stream, n); - raft::mr::device::buffer cur_ex_scan(d_alloc, stream, n); + int *cur_cnnz, T scalar, int n, cudaStream_t stream) { + rmm::device_uvector ex_scan(n, stream); + rmm::device_uvector cur_ex_scan(n, stream); CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream)); @@ -96,14 +94,14 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, thrust::device_ptr dev_cnnz = thrust::device_pointer_cast(cnnz); thrust::device_ptr dev_ex_scan = thrust::device_pointer_cast(ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz); thrust::device_ptr dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data()); - thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz, + thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); CUDA_CHECK(cudaPeekAtLastError()); @@ -122,15 +120,12 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz, * @param in: input COO matrix * @param out: output COO matrix * 
@param scalar: scalar to remove from arrays - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, COO *out, T scalar, - std::shared_ptr d_alloc, - cudaStream_t stream) { - raft::mr::device::buffer row_count_nz(d_alloc, stream, in->n_rows); - raft::mr::device::buffer row_count(d_alloc, stream, in->n_rows); +void coo_remove_scalar(COO *in, COO *out, T scalar, cudaStream_t stream) { + rmm::device_uvector row_count_nz(in->n_rows, stream); + rmm::device_uvector row_count(in->n_rows, stream); CUDA_CHECK( cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream)); @@ -146,7 +141,7 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, thrust::device_ptr d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data()); - int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, + int out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows); out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream); @@ -154,7 +149,7 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, coo_remove_scalar(in->rows(), in->cols(), in->vals(), in->nnz, out->rows(), out->cols(), out->vals(), row_count_nz.data(), row_count.data(), scalar, - in->n_rows, d_alloc, stream); + in->n_rows, stream); CUDA_CHECK(cudaPeekAtLastError()); } @@ -163,14 +158,11 @@ void coo_remove_scalar(COO *in, COO *out, T scalar, * * @param in: input COO matrix * @param out: output COO matrix - * @param d_alloc device allocator for temporary buffers * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, COO *out, - std::shared_ptr d_alloc, - cudaStream_t stream) { - coo_remove_scalar(in, out, T(0.0), d_alloc, stream); +void coo_remove_zeros(COO *in, COO *out, cudaStream_t stream) { + coo_remove_scalar(in, out, T(0.0), stream); } }; // namespace op diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh index 53c9f89074..09a35720fb 100644 --- a/cpp/include/raft/sparse/op/reduce.cuh +++ b/cpp/include/raft/sparse/op/reduce.cuh @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -32,7 +31,6 @@ #include #include #include -#include #include #include @@ -126,18 +124,16 @@ void max_duplicates(const raft::handle_t &handle, raft::sparse::COO &out, const value_idx *rows, const value_idx *cols, const value_t *vals, size_t nnz, size_t m, size_t n) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); - - auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream}); + auto thrust_policy = handle.get_thrust_policy(); // compute diffs & take exclusive scan rmm::device_uvector diff(nnz + 1, stream); compute_duplicates_mask(diff.data(), rows, cols, nnz, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(), - diff.data() + diff.size(), diff.data()); + thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), + diff.data()); // compute final size value_idx size = 0; diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h index 09d5b568be..c40801a0b1 100644 --- a/cpp/include/raft/sparse/op/sort.h +++ b/cpp/include/raft/sparse/op/sort.h @@ -20,9 +20,8 @@ #include #include #include -#include -#include #include +#include #include #include @@ -59,35 +58,28 @@ struct TupleComp { * @param rows rows array from coo matrix * @param cols cols array from coo matrix * @param vals vals array from coo matrix - * @param d_alloc device 
allocator for temporary buffers * @param stream: cuda stream to use */ template void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals, - // TODO: Remove this - std::shared_ptr d_alloc, cudaStream_t stream) { auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), coo_indices, - coo_indices + nnz, vals, TupleComp()); + thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz, + vals, TupleComp()); } /** * @brief Sort the underlying COO arrays by row * @tparam T: the type name of the underlying value array * @param in: COO to sort by row - * @param d_alloc device allocator for temporary buffers * @param stream: the cuda stream to use */ template -void coo_sort(COO *const in, - // TODO: Remove this - std::shared_ptr d_alloc, - cudaStream_t stream) { +void coo_sort(COO *const in, cudaStream_t stream) { coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), - in->vals(), d_alloc, stream); + in->vals(), stream); } /** @@ -107,8 +99,7 @@ void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data, auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols)); - thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, - first); + thrust::sort_by_key(rmm::exec_policy(stream), t_data, t_data + nnz, first); } }; // namespace op }; // end NAMESPACE sparse diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh index 8aae90f1d8..46369ca964 100644 --- a/cpp/include/raft/sparse/selection/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -159,14 +159,10 @@ struct CubKVPMinReduce { */ template value_idx get_n_components(value_idx *colors, size_t n_rows, - std::shared_ptr d_alloc, cudaStream_t stream) { - value_idx *map_ids; - int num_clusters; - raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, - d_alloc); - d_alloc->deallocate(map_ids, num_clusters * sizeof(value_idx), stream); - + rmm::device_uvector map_ids(0, stream); + int num_clusters = + raft::label::getUniquelabels(map_ids, colors, n_rows, stream); return num_clusters; } @@ -197,15 +193,13 @@ struct LookupColorOp { * @param[in] X original dense data * @param[in] n_rows number of rows in original dense data * @param[in] n_cols number of columns in original dense data - * @param[in] d_alloc device allocator to use * @param[in] stream cuda stream for which to order cuda operations */ template void perform_1nn(cub::KeyValuePair *kvp, value_idx *nn_colors, value_idx *colors, const value_t *X, - size_t n_rows, size_t n_cols, - std::shared_ptr d_alloc, - cudaStream_t stream, red_op reduction_op) { + size_t n_rows, size_t n_cols, cudaStream_t stream, + red_op reduction_op) { rmm::device_uvector workspace(n_rows, stream); rmm::device_uvector x_norm(n_rows, stream); @@ -218,7 +212,7 @@ void perform_1nn(cub::KeyValuePair *kvp, workspace.data(), reduction_op, reduction_op, true, true, stream); LookupColorOp extract_colors_op(colors); - thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, + thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op); } @@ -239,15 +233,15 @@ void sort_by_color(value_idx *colors, value_idx *nn_colors, cub::KeyValuePair *kvp, value_idx *src_indices, size_t n_rows, cudaStream_t stream) { thrust::counting_iterator 
arg_sort_iter(0); - thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, - arg_sort_iter + n_rows, src_indices); + thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, + src_indices); auto keys = thrust::make_zip_iterator(thrust::make_tuple( colors, nn_colors, (raft::linkage::KeyValuePair *)kvp)); auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices)); // get all the colors in contiguous locations so we can map them to warps. - thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, + thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, TupleComp()); } @@ -324,7 +318,6 @@ void connect_components(const raft::handle_t &handle, size_t n_rows, size_t n_cols, red_op reduction_op, raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded) { - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, @@ -336,10 +329,9 @@ void connect_components(const raft::handle_t &handle, // Normalize colors so they are drawn from a monotonically increasing set raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, - d_alloc, true); + true); - value_idx n_components = - get_n_components(colors.data(), n_rows, d_alloc, stream); + value_idx n_components = get_n_components(colors.data(), n_rows, stream); /** * First compute 1-nn for all colors where the color of each data point @@ -351,7 +343,7 @@ void connect_components(const raft::handle_t &handle, rmm::device_uvector src_indices(n_rows, stream); perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X, - n_rows, n_cols, d_alloc, stream, reduction_op); + n_rows, n_cols, stream, reduction_op); /** * Sort data points by color (neighbors are not sorted) @@ -369,7 +361,7 @@ void connect_components(const raft::handle_t &handle, raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(), nn_colors.data(), n_rows, stream); - thrust::exclusive_scan(thrust::cuda::par.on(stream), out_index.data(), + thrust::exclusive_scan(handle.get_thrust_policy(), out_index.data(), out_index.data() + out_index.size(), out_index.data()); // compute final size @@ -380,7 +372,7 @@ void connect_components(const raft::handle_t &handle, size++; - raft::sparse::COO min_edges(d_alloc, stream); + raft::sparse::COO min_edges(stream); min_edges.allocate(size, n_rows, n_rows, true, stream); min_components_by_color(min_edges, out_index.data(), src_indices.data(), diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index 71fbb8ab3d..3566939bc4 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -24,7 +24,6 @@ #include #include #include -#include #include #include @@ -415,7 +414,6 @@ class sparse_knn_t { * @param[out] output_dists dense matrix for output distances (size n_query_rows * k) * @param[in] k the number of neighbors to query * @param[in] cusparseHandle the initialized cusparseHandle instance to use - * @param[in] allocator device allocator instance to use * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to * @param[in] batch_size_index maximum number of rows to use from index matrix per batch * @param[in] batch_size_query maximum number of rows to use from query matrix per batch diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 
1cf225087a..1cdd66f516 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -96,7 +96,6 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, raft::sparse::COO &out, int c = 15) { int k = build_k(m, c); - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); size_t nnz = m * k; diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp index 77d7831b4a..2cdf9bf4f5 100644 --- a/cpp/include/raft/spatial/knn/ann.hpp +++ b/cpp/include/raft/spatial/knn/ann.hpp @@ -22,15 +22,12 @@ #include #include -#include #include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - /** * @brief Flat C++ API function to build an approximate nearest neighbors index * from an index array and a set of parameters. diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 6e4c99b646..77ad4afe96 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -39,14 +39,11 @@ #include #include -#include #include #include -#include #include -#include #include #include @@ -145,8 +142,7 @@ void approx_knn_build_index(raft::handle_t &handle, // perform preprocessing // k set to 0 (unused during preprocessing / revertion) std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, 0, false, handle.get_stream(), - handle.get_device_allocator()); + create_processor(metric, n, D, 0, false, handle.get_stream()); query_metric_processor->preprocess(index_array); @@ -183,7 +179,7 @@ void approx_knn_search(raft::handle_t &handle, float *distances, // perform preprocessing std::unique_ptr> query_metric_processor = create_processor(index->metric, n, index->index->d, k, false, - handle.get_stream(), handle.get_device_allocator()); + handle.get_stream()); query_metric_processor->preprocess(query_array); index->index->search(n, query_array, k, distances, indices); diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 09494e9eb1..84c130b0e4 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -27,7 +29,6 @@ #include #include -#include #include #include #include @@ -179,7 +180,6 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, * @param[out] res_I pointer to device memory for returning k nearest indices * @param[out] res_D pointer to device memory for returning k nearest distances * @param[in] k number of neighbors to query - * @param[in] allocator the device memory allocator to use for temporary scratch memory * @param[in] userStream the main cuda stream to use * @param[in] internalStreams optional when n_params > 0, the index partitions can be * queried in parallel using these streams. 
Note that n_int_streams also @@ -198,7 +198,6 @@ template void brute_force_knn_impl(std::vector &input, std::vector &sizes, IntType D, float *search_items, IntType n, int64_t *res_I, float *res_D, IntType k, - std::shared_ptr allocator, cudaStream_t userStream, cudaStream_t *internalStreams = nullptr, int n_int_streams = 0, bool rowMajorIndex = true, @@ -228,28 +227,26 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, // perform preprocessing std::unique_ptr> query_metric_processor = - create_processor(metric, n, D, k, rowMajorQuery, userStream, - allocator); + create_processor(metric, n, D, k, rowMajorQuery, userStream); query_metric_processor->preprocess(search_items); std::vector>> metric_processors( input.size()); for (size_t i = 0; i < input.size(); i++) { - metric_processors[i] = create_processor( - metric, sizes[i], D, k, rowMajorQuery, userStream, allocator); + metric_processors[i] = create_processor(metric, sizes[i], D, k, + rowMajorQuery, userStream); metric_processors[i]->preprocess(input[i]); } int device; CUDA_CHECK(cudaGetDevice(&device)); - raft::mr::device::buffer trans(allocator, userStream, - id_ranges->size()); + rmm::device_uvector trans(id_ranges->size(), userStream); raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream); - raft::mr::device::buffer all_D(allocator, userStream, 0); - raft::mr::device::buffer all_I(allocator, userStream, 0); + rmm::device_uvector all_D(0, userStream); + rmm::device_uvector all_I(0, userStream); float *out_D = res_D; int64_t *out_I = res_I; diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index a645412c2f..876e91e877 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -19,16 +19,14 @@ #include #include #include -#include -#include #include #include +#include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; /** * @brief A virtual class defining pre- and post-processing * for metrics. 
This class will temporarily modify its given @@ -56,16 +54,13 @@ class CosineMetricProcessor : public MetricProcessor { size_t n_rows_; size_t n_cols_; cudaStream_t stream_; - std::shared_ptr device_allocator_; - raft::mr::device::buffer colsums_; + rmm::device_uvector colsums_; public: CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, - cudaStream_t stream, - std::shared_ptr allocator) - : device_allocator_(allocator), - stream_(stream), - colsums_(allocator, stream, n_rows), + cudaStream_t stream) + : stream_(stream), + colsums_(n_rows, stream), n_cols_(n_cols), n_rows_(n_rows), row_major_(row_major), @@ -104,11 +99,9 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { public: CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k, - bool row_major, cudaStream_t stream, - std::shared_ptr allocator) - : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream, - allocator), - means_(allocator, stream, n_rows) {} + bool row_major, cudaStream_t stream) + : CosineMetricProcessor(n_rows, n_cols, k, row_major, stream), + means_(n_rows, stream) {} void preprocess(math_t *data) { math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_; @@ -143,7 +136,7 @@ class CorrelationMetricProcessor : public CosineMetricProcessor { ~CorrelationMetricProcessor() = default; - raft::mr::device::buffer means_; + rmm::device_uvector means_; }; template @@ -161,18 +154,18 @@ class DefaultMetricProcessor : public MetricProcessor { template inline std::unique_ptr> create_processor( distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, - cudaStream_t userStream, std::shared_ptr allocator) { + cudaStream_t userStream) { MetricProcessor *mp = nullptr; switch (metric) { case distance::DistanceType::CosineExpanded: - mp = new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream, - allocator); + mp = + new CosineMetricProcessor(n, D, k, rowMajorQuery, userStream); break; case distance::DistanceType::CorrelationExpanded: mp = new CorrelationMetricProcessor(n, D, k, rowMajorQuery, - userStream, allocator); + userStream); break; default: mp = new DefaultMetricProcessor(); diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp index a3a1972c13..71c547c281 100644 --- a/cpp/include/raft/spatial/knn/knn.hpp +++ b/cpp/include/raft/spatial/knn/knn.hpp @@ -18,15 +18,12 @@ #include "detail/knn_brute_force_faiss.cuh" -#include #include namespace raft { namespace spatial { namespace knn { -using deviceAllocator = raft::mr::device::allocator; - template inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK, value_idx *outV, size_t n_samples, int n_parts, @@ -72,8 +69,7 @@ inline void brute_force_knn( std::vector int_streams = handle.get_internal_streams(); detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D, - k, handle.get_device_allocator(), - handle.get_stream(), int_streams.data(), + k, handle.get_stream(), int_streams.data(), handle.get_num_internal_streams(), rowMajorIndex, rowMajorQuery, translations, metric, metric_arg); } diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp index 922ae7cfab..6f507331d9 100644 --- a/cpp/include/raft/spectral/cluster_solvers.hpp +++ b/cpp/include/raft/spectral/cluster_solvers.hpp @@ -42,19 +42,16 @@ struct kmeans_solver_t { size_type_t> const& config) : config_(config) {} - template std::pair solve( - handle_t const& handle, thrust_exe_policy_t t_exe_policy, - size_type_t n_obs_vecs, 
size_type_t dim, + handle_t const& handle, size_type_t n_obs_vecs, size_type_t dim, value_type_t const* __restrict__ obs, index_type_t* __restrict__ codes) const { RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); value_type_t residual{}; index_type_t iters{}; - kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters, - config_.tol, config_.maxIter, obs, codes, residual, iters, - config_.seed); + kmeans(handle, n_obs_vecs, dim, config_.n_clusters, config_.tol, + config_.maxIter, obs, codes, residual, iters, config_.seed); return std::make_pair(residual, iters); } diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index fb05bff3e2..b6f0105487 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -325,7 +324,6 @@ static __global__ void divideCentroids( * Centroid is randomly chosen with k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. * @param n Number of observation vectors. * @param d Dimension of observation vectors. @@ -341,12 +339,9 @@ static __global__ void divideCentroids( * coordinates. * @return Zero if successful. Otherwise non-zero. */ -template -static int chooseNewCentroid(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, - value_type_t rand, +template +static int chooseNewCentroid(handle_t const& handle, index_type_t n, + index_type_t d, index_type_t k, value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, value_type_t* __restrict__ centroid) { @@ -357,8 +352,9 @@ static int chooseNewCentroid(handle_t const& handle, // Observation vector that is chosen as new centroid index_type_t obsIndex; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Compute cumulative sum of distances thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists), @@ -417,10 +413,7 @@ static int chooseNewCentroid(handle_t const& handle, * Centroids are randomly chosen with k-means++ algorithm * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -439,14 +432,12 @@ static int chooseNewCentroid(handle_t const& handle, * distance between observation vectors and the closest centroid. * @return Zero if successful. Otherwise non-zero. 
*/ -template +template static int initializeCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes, - index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists, - unsigned long long seed) { + handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, value_type_t* __restrict__ centroids, + index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, + value_type_t* __restrict__ dists, unsigned long long seed) { // ------------------------------------------------------- // Variable declarations // ------------------------------------------------------- @@ -458,8 +449,9 @@ static int initializeCentroids( thrust::default_random_engine rng(seed); thrust::uniform_real_distribution uniformDist(0, 1); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); constexpr index_type_t grid_lower_bound{65535}; @@ -486,8 +478,8 @@ static int initializeCentroids( thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); CHECK_CUDA(stream); - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids)) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid @@ -499,8 +491,8 @@ static int initializeCentroids( // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng), - obs, dists, centroids + IDX(0, i, d))) + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from ith centroid @@ -529,10 +521,7 @@ static int initializeCentroids( * Distance is measured with Euclidean norm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -553,16 +542,18 @@ static int initializeCentroids( * of squares of assignment. * @return Zero if successful. Otherwise non-zero. 
*/ -template -static int assignCentroids( - handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n, - index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, - const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists, - index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, - value_type_t* residual_host) { - auto cublas_h = handle.get_cublas_handle(); +template +static int assignCentroids(handle_t const& handle, index_type_t n, + index_type_t d, index_type_t k, + const value_type_t* __restrict__ obs, + const value_type_t* __restrict__ centroids, + value_type_t* __restrict__ dists, + index_type_t* __restrict__ codes, + index_type_t* __restrict__ clusterSizes, + value_type_t* residual_host) { auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Compute distance between centroids and observation vectors CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream)); @@ -606,10 +597,7 @@ static int assignCentroids( * All clusters are assumed to be non-empty. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -628,10 +616,8 @@ static int assignCentroids( * Workspace. * @return Zero if successful. Otherwise non-zero. */ -template -static int updateCentroids(handle_t const& handle, - thrust_exe_pol_t thrust_exec_policy, index_type_t n, +template +static int updateCentroids(handle_t const& handle, index_type_t n, index_type_t d, index_type_t k, const value_type_t* __restrict__ obs, const index_type_t* __restrict__ codes, @@ -649,8 +635,9 @@ static int updateCentroids(handle_t const& handle, constexpr index_type_t grid_lower_bound{65535}; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Device memory thrust::device_ptr obs_copy(work); @@ -722,10 +709,7 @@ namespace raft { * k-means++ algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -754,11 +738,10 @@ namespace raft { * @param seed random seed to be used. * @return error flag. 
*/ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, + index_type_t k, value_type_t tol, index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ centroids, @@ -785,16 +768,17 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Initialization // ------------------------------------------------------- - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); // Trivial cases if (k == 1) { CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream)); CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream)); - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, + work, work_int)) WARNING("could not compute k-means centroids"); dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE}; @@ -840,21 +824,21 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // ------------------------------------------------------- // Choose initial cluster centroids - if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - codes, clusterSizes, work, seed)) + if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, + work, seed)) WARNING("could not initialize k-means centroids"); // Apply k-means iteration until convergence for (iter = 0; iter < maxiter; ++iter) { // Update cluster centroids - if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes, - clusterSizes, centroids, work, work_int)) + if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, + work, work_int)) WARNING("could not update k-means centroids"); // Determine centroid closest to each observation residualPrev = *residual_host; - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, + clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); // Reinitialize empty clusters with new centroids @@ -868,12 +852,11 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, - uniformDist(rng), obs, work, + if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); - if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids, - work, codes, clusterSizes, residual_host)) + if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, + clusterSizes, residual_host)) WARNING("could not assign observation vectors to k-means clusters"); emptyCentroid = (thrust::find(thrust_exec_policy, @@ -905,10 +888,7 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * k-means++ 
algorithm. * @tparam index_type_t the type of data used for indexing. * @tparam value_type_t the type of data used for weights, distances. - * @tparam thrust_exe_pol_t the type of thrust execution policy. * @param handle the raft handle. - * @param thrust_exec_policy thrust execution policy - * (assumed to have same stream as handle.stream). * @param n Number of observation vectors. * @param d Dimension of observation vectors. * @param k Number of clusters. @@ -926,11 +906,10 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, * @param seed random seed to be used. * @return error flag */ -template -int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, - index_type_t n, index_type_t d, index_type_t k, value_type_t tol, - index_type_t maxiter, const value_type_t* __restrict__ obs, +template +int kmeans(handle_t const& handle, index_type_t n, index_type_t d, + index_type_t k, value_type_t tol, index_type_t maxiter, + const value_type_t* __restrict__ obs, index_type_t* __restrict__ codes, value_type_t& residual, index_type_t& iters, unsigned long long seed = 123456) { using namespace matrix; @@ -950,9 +929,8 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, // Perform k-means return kmeans( - handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes, - clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual, - &iters, seed); + handle, n, d, k, tol, maxiter, obs, codes, clusterSizes.raw(), + centroids.raw(), work.raw(), work_int.raw(), &residual, &iters, seed); } } // namespace raft diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp index c43154d17a..42fc621a1a 100644 --- a/cpp/include/raft/spectral/matrix_wrappers.hpp +++ b/cpp/include/raft/spectral/matrix_wrappers.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -72,52 +73,30 @@ struct vector_view_t { : buffer_(buffer), size_(sz) {} vector_view_t(vector_view_t&& other) - : buffer_(other.buffer_), size_(other.size_) { - other.buffer_ = nullptr; - other.size_ = 0; - } + : buffer_(other.raw()), size_(other.size()) {} vector_view_t& operator=(vector_view_t&& other) { - buffer_ = other.buffer_; - size_ = other.size_; - - other.buffer_ = nullptr; - other.size_ = 0; + buffer_ = other.raw(); + size_ = other.size(); } }; -// allocatable vector, using raft handle allocator -// template class vector_t { - handle_t const& handle_; - value_type* buffer_; - size_type size_; - cudaStream_t stream_; - public: vector_t(handle_t const& raft_handle, size_type sz) - : handle_(raft_handle), - buffer_( - static_cast(raft_handle.get_device_allocator()->allocate( - sz * sizeof(value_type), raft_handle.get_stream()))), - size_(sz), - stream_(raft_handle.get_stream()) {} - - ~vector_t(void) { - handle_.get_device_allocator()->deallocate( - buffer_, size_ * sizeof(value_type), stream_); - } + : buffer_(sz, raft_handle.get_stream()), + thrust_policy(raft_handle.get_thrust_policy()) {} - size_type size(void) const { return size_; } + size_type size(void) const { return buffer_.size(); } - value_type* raw(void) { return buffer_; } + value_type* raw(void) { return buffer_.data(); } - value_type const* raw(void) const { return buffer_; } + value_type const* raw(void) const { return buffer_.data(); } - template - value_type nrm1(ThrustExecPolicy t_exe_pol) const { - return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0}, + value_type nrm1() const { + return 
thrust::reduce(thrust_policy, buffer_.data(), + buffer_.data() + buffer_.size(), value_type{0}, [] __device__(auto left, auto right) { auto abs_left = left > 0 ? left : -left; auto abs_right = right > 0 ? right : -right; @@ -125,10 +104,15 @@ class vector_t { }); } - template - void fill(ThrustExecPolicy t_exe_pol, value_type value) { - thrust::fill_n(t_exe_pol, buffer_, size_, value); + void fill(value_type value) { + thrust::fill_n(thrust_policy, buffer_.data(), buffer_.size(), value); } + + private: + using thrust_exec_policy_t = thrust::detail::execute_with_allocator< + rmm::mr::thrust_allocator, thrust::cuda_cub::execute_on_stream_base>; + rmm::device_uvector buffer_; + const thrust_exec_policy_t thrust_policy; }; template @@ -280,31 +264,26 @@ struct sparse_matrix_t { template struct laplacian_matrix_t : sparse_matrix_t { - template - laplacian_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, - index_type const* row_offsets, + laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : sparse_matrix_t(raft_handle, row_offsets, col_indices, values, nrows, nnz), diagonal_(raft_handle, nrows) { vector_t ones{raft_handle, nrows}; - ones.fill(thrust_exec_policy, 1.0); + ones.fill(1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } - template laplacian_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) : sparse_matrix_t(raft_handle, csr_m.row_offsets_, csr_m.col_indices_, csr_m.values_, csr_m.nrows_, csr_m.nnz_), diagonal_(raft_handle, csr_m.nrows_) { vector_t ones{raft_handle, csr_m.nrows_}; - ones.fill(thrust_exec_policy, 1.0); + ones.fill(1.0); sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } @@ -351,27 +330,19 @@ struct laplacian_matrix_t : sparse_matrix_t { template struct modularity_matrix_t : laplacian_matrix_t { - template modularity_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, index_type const* row_offsets, index_type const* col_indices, value_type const* values, index_type const nrows, index_type const nnz) : laplacian_matrix_t( - raft_handle, thrust_exec_policy, row_offsets, col_indices, values, - nrows, nnz) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + raft_handle, row_offsets, col_indices, values, nrows, nnz) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } - template modularity_matrix_t(handle_t const& raft_handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, - thrust_exec_policy, csr_m) { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1( - thrust_exec_policy); + : laplacian_matrix_t(raft_handle, csr_m) { + edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } // y = alpha*A*x + beta*y diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp index f8dfe5daa3..fededbfcb4 100644 --- a/cpp/include/raft/spectral/modularity_maximization.hpp +++ b/cpp/include/raft/spectral/modularity_maximization.hpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -79,19 +78,18 @@ using namespace linalg; * performed. * @return error flag. 
*/ -template +template std::tuple modularity_maximization( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + handle_t const &handle, sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); std::tuple stats; // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -101,7 +99,7 @@ std::tuple modularity_maximization( // Compute eigenvectors of Modularity Matrix // Initialize Modularity Matrix - modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + modularity_matrix_t B{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -111,7 +109,7 @@ std::tuple modularity_maximization( eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); // Whiten eigenvector matrix - transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // notice that at this point the matrix has already been transposed, so we are scaling // columns @@ -119,8 +117,8 @@ std::tuple modularity_maximization( CHECK_CUDA(stream); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -138,9 +136,8 @@ std::tuple modularity_maximization( * @param clusters (Input, device memory, n entries) Cluster assignments. * @param modularity On exit, modularity */ -template +template void analyzeModularity(handle_t const &handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const &csr_m, vertex_t nClusters, vertex_t const *__restrict__ clusters, @@ -163,15 +160,15 @@ void analyzeModularity(handle_t const &handle, cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); // Initialize Modularity - modularity_matrix_t B{handle, thrust_exec_policy, csr_m}; + modularity_matrix_t B{handle, csr_m}; // Initialize output modularity = 0; // Iterate through partitions for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partModularity, clusters, part_i, Bx, B)) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, + clusters, part_i, Bx, B)) { WARNING("empty partition"); continue; } @@ -180,7 +177,7 @@ void analyzeModularity(handle_t const &handle, modularity += partModularity; } - modularity = modularity / B.diagonal_.nrm1(thrust_exec_policy); + modularity = modularity / B.diagonal_.nrm1(); } } // namespace spectral diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp index 841fca04d9..2df3812a4a 100644 --- a/cpp/include/raft/spectral/partition.hpp +++ b/cpp/include/raft/spectral/partition.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -62,19 +61,18 @@ using namespace linalg; * performed. * @return statistics: number of eigensolver iterations, . 
*/ -template +template std::tuple partition( - handle_t const &handle, ThrustExePolicy thrust_exec_policy, - sparse_matrix_t const &csr_m, + handle_t const &handle, sparse_matrix_t const &csr_m, EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver, vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) { RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); std::tuple stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver @@ -89,7 +87,7 @@ std::tuple partition( // Initialize Laplacian ///sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + laplacian_matrix_t L{handle, csr_m}; auto eigen_config = eigen_solver.get_config(); auto nEigVecs = eigen_config.n_eigVecs; @@ -99,11 +97,11 @@ std::tuple partition( eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); // Whiten eigenvector matrix - transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs); + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n, - nEigVecs, eigVecs, clusters); + auto pair_cluster = + cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); std::get<1>(stats) = pair_cluster.first; std::get<2>(stats) = pair_cluster.second; @@ -129,9 +127,8 @@ std::tuple partition( * @param cost On exit, partition cost function. * @return error flag. */ -template +template void analyzePartition(handle_t const &handle, - ThrustExePolicy thrust_exec_policy, sparse_matrix_t const &csr_m, vertex_t nClusters, const vertex_t *__restrict__ clusters, weight_t &edgeCut, weight_t &cost) { @@ -140,8 +137,8 @@ void analyzePartition(handle_t const &handle, vertex_t i; vertex_t n = csr_m.nrows_; - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); weight_t partEdgesCut, clustersize; @@ -155,7 +152,7 @@ void analyzePartition(handle_t const &handle, // Initialize Laplacian ///sparse_matrix_t A{handle, graph}; - laplacian_matrix_t L{handle, thrust_exec_policy, csr_m}; + laplacian_matrix_t L{handle, csr_m}; // Initialize output cost = 0; @@ -164,8 +161,8 @@ void analyzePartition(handle_t const &handle, // Iterate through partitions for (i = 0; i < nClusters; ++i) { // Construct indicator vector for ith partition - if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize, - partEdgesCut, clusters, part_i, Lx, L)) { + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, + part_i, Lx, L)) { WARNING("empty partition"); continue; } diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp index 40dde30a74..c148350c0f 100644 --- a/cpp/include/raft/spectral/spectral_util.hpp +++ b/cpp/include/raft/spectral/spectral_util.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -108,13 +107,12 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) { return cudaSuccess; } -template -void transform_eigen_matrix(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t n, - vertex_t nEigVecs, weight_t* eigVecs) { - auto 
cublas_h = handle.get_cublas_handle(); +template +void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, + weight_t* eigVecs) { auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); const weight_t zero{0.0}; const weight_t one{1.0}; @@ -187,16 +185,15 @@ struct equal_to_i_op { // Construct indicator vector for ith partition // -template -bool construct_indicator(handle_t const& handle, - ThrustExePolicy thrust_exec_policy, edge_t index, - edge_t n, weight_t& clustersize, weight_t& partStats, +template +bool construct_indicator(handle_t const& handle, edge_t index, edge_t n, + weight_t& clustersize, weight_t& partStats, vertex_t const* __restrict__ clusters, vector_t& part_i, vector_t& Bx, laplacian_matrix_t const& B) { - auto cublas_h = handle.get_cublas_handle(); auto stream = handle.get_stream(); + auto cublas_h = handle.get_cublas_handle(); + auto thrust_exec_policy = handle.get_thrust_policy(); thrust::for_each(thrust_exec_policy, thrust::make_zip_iterator(thrust::make_tuple( diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index 4ff6cdf5fa..d280b3e95c 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -49,8 +49,7 @@ TEST(Raft, ClusterSolvers) { kmeans_solver_t cluster_solver{cfg}; - EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, - eigvecs, codes)); + EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); } TEST(Raft, ModularitySolvers) { @@ -89,14 +88,12 @@ TEST(Raft, ModularitySolvers) { auto stream = h.get_stream(); sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - auto t_exe_p = thrust::cuda::par.on(stream); EXPECT_ANY_THROW(spectral::modularity_maximization( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); value_type modularity{0}; - EXPECT_ANY_THROW( - spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity)); + EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity)); } } // namespace raft diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index e2ed2c01dc..8d5cd68f13 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -77,12 +77,11 @@ class DistanceAdjTest int n = params.n; int k = params.k; bool isRowMajor = params.isRowMajor; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(dist_ref, m * n); - raft::allocate(dist, m * n); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(dist_ref, m * n, stream); + raft::allocate(dist, m * n, stream); r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); @@ -94,7 +93,7 @@ class DistanceAdjTest raft::distance::getWorkspaceSize(x, y, m, n, k); if (worksize != 0) { - raft::allocate(workspace, worksize); + raft::allocate(workspace, worksize, stream); } auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) { @@ -103,21 +102,16 @@ class DistanceAdjTest raft::distance::distance( x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(workspace)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(x)); - CUDA_CHECK(cudaFree(y)); - 
CUDA_CHECK(cudaFree(dist_ref)); - CUDA_CHECK(cudaFree(dist)); - } + void TearDown() override {} protected: DistanceAdjInputs params; DataType *x, *y; bool *dist_ref, *dist; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index 9e3290593d..4798d102f3 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -392,13 +392,12 @@ class DistanceTest : public ::testing::TestWithParam> { int k = params.k; DataType metric_arg = params.metric_arg; bool isRowMajor = params.isRowMajor; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(dist_ref, m * n); - raft::allocate(dist, m * n); - raft::allocate(dist2, m * n); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(dist_ref, m * n, stream); + raft::allocate(dist, m * n, stream); + raft::allocate(dist2, m * n, stream); if (distanceType == raft::distance::DistanceType::HellingerExpanded || distanceType == raft::distance::DistanceType::JensenShannon || distanceType == raft::distance::DistanceType::KLDivergence) { @@ -416,7 +415,6 @@ class DistanceTest : public ::testing::TestWithParam> { r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream); r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream); } - naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, metric_arg); char *workspace = nullptr; @@ -424,28 +422,24 @@ class DistanceTest : public ::testing::TestWithParam> { raft::distance::getWorkspaceSize(x, y, m, n, k); if (worksize != 0) { - raft::allocate(workspace, worksize); + raft::allocate(workspace, worksize, stream); } DataType threshold = -10000.f; distanceLauncher(x, y, dist, dist2, m, n, k, params, threshold, workspace, worksize, stream, isRowMajor, metric_arg); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(workspace)); } void TearDown() override { - CUDA_CHECK(cudaFree(x)); - CUDA_CHECK(cudaFree(y)); - CUDA_CHECK(cudaFree(dist_ref)); - CUDA_CHECK(cudaFree(dist)); - CUDA_CHECK(cudaFree(dist2)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: DistanceInputs params; DataType *x, *y, *dist_ref, *dist, *dist2; + cudaStream_t stream; }; } // end namespace distance diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 4573a070b6..cfea4ee2d9 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -107,13 +107,13 @@ class FusedL2NNTest : public ::testing::TestWithParam> { int n = params.n; int k = params.k; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(x, m * k); - raft::allocate(y, n * k); - raft::allocate(xn, m); - raft::allocate(yn, n); - raft::allocate(workspace, sizeof(int) * m); - raft::allocate(min, m); - raft::allocate(min_ref, m); + raft::allocate(x, m * k, stream); + raft::allocate(y, n * k, stream); + raft::allocate(xn, m, stream); + raft::allocate(yn, n, stream); + raft::allocate(workspace, sizeof(int) * m, stream); + raft::allocate(min, m, stream); + raft::allocate(min_ref, m, stream); r.uniform(x, m * k, DataT(-1.0), DataT(1.0), stream); r.uniform(y, n * k, DataT(-1.0), DataT(1.0), stream); generateGoldenResult(); @@ -122,15 +122,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - 
CUDA_CHECK(cudaFree(x)); - CUDA_CHECK(cudaFree(y)); - CUDA_CHECK(cudaFree(xn)); - CUDA_CHECK(cudaFree(yn)); - CUDA_CHECK(cudaFree(workspace)); - CUDA_CHECK(cudaFree(min_ref)); - CUDA_CHECK(cudaFree(min)); } protected: @@ -282,18 +275,17 @@ class FusedL2NNDetTest : public FusedL2NNTest { void SetUp() override { FusedL2NNTest::SetUp(); int m = this->params.m; - raft::allocate(min1, m); + CUDA_CHECK(cudaStreamCreate(&stream)); + raft::allocate(min1, m, stream); } - void TearDown() override { - FusedL2NNTest::TearDown(); - CUDA_CHECK(cudaFree(min1)); - } + void TearDown() override { FusedL2NNTest::TearDown(); } protected: cub::KeyValuePair *min1; static const int NumRepeats = 100; + cudaStream_t stream; void generateGoldenResult() override {} }; diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 328137f42d..15794ef568 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -100,18 +100,15 @@ TEST(Raft, SpectralSolvers) { seed}; kmeans_solver_t cluster_solver{clust_cfg}; - auto stream = h.get_stream(); - - auto t_exe_p = thrust::cuda::par.on(stream); sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - EXPECT_ANY_THROW(spectral::partition( - h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + EXPECT_ANY_THROW(spectral::partition(h, sm, eig_solver, cluster_solver, + clusters, eigvals, eigvecs)); value_type edgeCut{0}; value_type cost{0}; EXPECT_ANY_THROW( - spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost)); + spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost)); } } // namespace raft diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu index dc2846fdba..b28c754a5a 100644 --- a/cpp/test/label/label.cu +++ b/cpp/test/label/label.cu @@ -20,7 +20,6 @@ #include #include -#include #include "../test_utils.h" #include @@ -44,9 +43,9 @@ TEST_F(MakeMonotonicTest, Result) { float *data, *actual, *expected; - raft::allocate(data, m, true); - raft::allocate(actual, m, true); - raft::allocate(expected, m, true); + raft::allocate(data, m, stream, true); + raft::allocate(actual, m, stream, true); + raft::allocate(expected, m, stream, true); float *data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0}; @@ -57,17 +56,14 @@ TEST_F(MakeMonotonicTest, Result) { raft::update_device(data, data_h, m, stream); raft::update_device(expected, expected_h, m, stream); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); - make_monotonic(actual, data, m, stream, allocator); + make_monotonic(actual, data, m, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); ASSERT_TRUE(devArrMatch(actual, expected, m, raft::Compare(), stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(actual)); delete data_h; delete expected_h; @@ -76,39 +72,35 @@ TEST_F(MakeMonotonicTest, Result) { TEST(labelTest, Classlabels) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr allocator( - new raft::mr::device::default_allocator); int n_rows = 6; float *y_d; - raft::allocate(y_d, n_rows); + raft::allocate(y_d, n_rows, stream); float y_h[] = {2, -1, 1, 2, 1, 1}; raft::update_device(y_d, y_h, n_rows, stream); - int n_classes; - float *y_unique_d; - getUniquelabels(y_d, n_rows, &y_unique_d, &n_classes, stream, allocator); + rmm::device_uvector y_unique_d(0, stream); + int n_classes = getUniquelabels(y_unique_d, y_d, n_rows, stream); ASSERT_EQ(n_classes, 3); float y_unique_exp[] = {-1, 1, 2}; - 
EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, + EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, raft::Compare(), stream)); float *y_relabeled_d; - raft::allocate(y_relabeled_d, n_rows); + raft::allocate(y_relabeled_d, n_rows, stream); - getOvrlabels(y_d, n_rows, y_unique_d, n_classes, y_relabeled_d, 2, stream); + getOvrlabels(y_d, n_rows, y_unique_d.data(), n_classes, y_relabeled_d, 2, + stream); float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1}; EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, raft::Compare(), stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(y_d)); - CUDA_CHECK(cudaFree(y_unique_d)); - CUDA_CHECK(cudaFree(y_relabeled_d)); } }; // namespace label }; // namespace raft diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu index a2f14a8dbc..28d8d59884 100644 --- a/cpp/test/label/merge_labels.cu +++ b/cpp/test/label/merge_labels.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include "../test_utils.h" @@ -50,7 +50,7 @@ class MergeLabelsTest expected(params.N, stream), R(params.N, stream), mask(params.N, stream), - m(1, stream) {} + m(stream) {} void Run() { raft::update_device(labels_a.data(), params.labels_a.data(), params.N, @@ -76,7 +76,7 @@ class MergeLabelsTest raft::handle_t handle; cudaStream_t stream; rmm::device_uvector labels_a, labels_b, expected, R; - rmm::device_uvector mask, m; + rmm::device_scalar mask, m; }; using MergeLabelsTestI = MergeLabelsTest; diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu index 04f473f836..08429e18f2 100644 --- a/cpp/test/lap/lap.cu +++ b/cpp/test/lap/lap.cu @@ -24,6 +24,8 @@ */ #include +#include + #include #include #include @@ -65,15 +67,12 @@ void hungarian_test(int problemsize, int costrange, int problemcount, for (int j = 0; j < problemcount; j++) { generateProblem(h_cost, batchsize, problemsize, costrange); - raft::mr::device::buffer elements_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize * problemsize); - raft::mr::device::buffer row_assignment_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize); - raft::mr::device::buffer col_assignment_v( - handle.get_device_allocator(), handle.get_stream(), - batchsize * problemsize); + rmm::device_uvector elements_v( + batchsize * problemsize * problemsize, handle.get_stream()); + rmm::device_uvector row_assignment_v(batchsize * problemsize, + handle.get_stream()); + rmm::device_uvector col_assignment_v(batchsize * problemsize, + handle.get_stream()); raft::update_device(elements_v.data(), h_cost, batchsize * problemsize * problemsize, diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index 2fc9d4e30f..301f069a33 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -32,10 +32,10 @@ class AddTest : public ::testing::TestWithParam> { raft::random::Rng r(params.seed); int len = params.len; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in1, len); - raft::allocate(in2, len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, InT(-1.0), InT(1.0), stream); r.uniform(in2, len, InT(-1.0), InT(1.0), stream); naiveAddElem(out_ref, in1, in2, len); @@ -43,11 +43,7 @@ class AddTest : public ::testing::TestWithParam> { } void TearDown() override { - 
CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index 3ae4f86066..475d8e58ff 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include #include "../test_utils.h" #include "binary_op.cuh" @@ -48,10 +48,10 @@ class BinaryOpTest cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); IdxType len = params.len; - allocate(in1, len); - allocate(in2, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, InType(-1.0), InType(1.0), stream); r.uniform(in2, len, InType(-1.0), InType(1.0), stream); naiveAdd(out_ref, in1, in2, len); @@ -136,9 +136,9 @@ class BinaryOpAlignment : public ::testing::Test { // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly // chosen. int n = 1024; - mr::device::buffer x(handle.get_device_allocator(), stream, n); - mr::device::buffer y(handle.get_device_allocator(), stream, n); - mr::device::buffer z(handle.get_device_allocator(), stream, n); + rmm::device_uvector x(n, stream); + rmm::device_uvector y(n, stream); + rmm::device_uvector z(n, stream); CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream)); CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream)); raft::linalg::binaryOp( diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu index 00236d53fa..00db1715dc 100644 --- a/cpp/test/linalg/cholesky_r1.cu +++ b/cpp/test/linalg/cholesky_r1.cu @@ -19,8 +19,9 @@ #include #include #include -#include -#include +#include +#include + #include #include #include "../test_utils.h" @@ -31,12 +32,11 @@ template class CholeskyR1Test : public ::testing::Test { protected: CholeskyR1Test() - : allocator(handle.get_device_allocator()), - G(allocator, handle.get_stream(), n_rows * n_rows), - L(allocator, handle.get_stream(), n_rows * n_rows), - L_exp(allocator, handle.get_stream(), n_rows * n_rows), - devInfo(allocator, handle.get_stream(), 1), - workspace(allocator, handle.get_stream()) { + : G(n_rows * n_rows, handle.get_stream()), + L(n_rows * n_rows, handle.get_stream()), + L_exp(n_rows * n_rows, handle.get_stream()), + devInfo(handle.get_stream()), + workspace(0, handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(G.data(), G_host, n_rows * n_rows, stream); @@ -105,7 +105,6 @@ class CholeskyR1Test : public ::testing::Test { } raft::handle_t handle; - std::shared_ptr allocator; cusolverDnHandle_t solver_handle; cudaStream_t stream; @@ -120,11 +119,11 @@ class CholeskyR1Test : public ::testing::Test { math_t G2_host[4] = {3, 4, 2, 1}; - raft::mr::device::buffer devInfo; - raft::mr::device::buffer G; - raft::mr::device::buffer L_exp; - raft::mr::device::buffer L; - raft::mr::device::buffer workspace; + rmm::device_scalar devInfo; + rmm::device_uvector G; + rmm::device_uvector L_exp; + rmm::device_uvector L; + rmm::device_uvector workspace; }; typedef ::testing::Types FloatTypes; diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index e45f5651b4..45dbd9dcc4 100644 --- 
a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -57,11 +57,10 @@ class coalescedReductionTest raft::random::Rng r(params.seed); int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); - raft::allocate(dots_exp, rows); - raft::allocate(dots_act, rows); + raft::allocate(data, len, stream); + raft::allocate(dots_exp, rows, stream); + raft::allocate(dots_act, rows, stream); r.uniform(data, len, T(-1.0), T(1.0), stream); naiveCoalescedReduction(dots_exp, data, cols, rows, stream); @@ -70,18 +69,18 @@ class coalescedReductionTest // Add to result with inplace = true next coalescedReductionLaunch(dots_act, data, cols, rows, stream, true); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: coalescedReductionInputs params; T *data, *dots_exp, *dots_act; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 2396558939..563f96c835 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -51,27 +51,26 @@ class DivideTest ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in, len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, T(-1.0), T(1.0), stream); naiveDivide(out_ref, in, params.scalar, len, stream); divideScalar(out, in, params.scalar, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: UnaryOpInputs params; T *in, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 159d288174..6e26757cf3 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -50,24 +50,24 @@ class EigTest : public ::testing::TestWithParam> { raft::random::Rng r(params.seed); int len = params.len; - raft::allocate(cov_matrix, len); + raft::allocate(cov_matrix, len, stream); T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); - raft::allocate(eig_vectors, len); - raft::allocate(eig_vals, params.n_col); - raft::allocate(eig_vectors_jacobi, len); - raft::allocate(eig_vals_jacobi, params.n_col); + raft::allocate(eig_vectors, len, stream); + raft::allocate(eig_vals, params.n_col, stream); + raft::allocate(eig_vectors_jacobi, len, stream); + raft::allocate(eig_vals_jacobi, params.n_col, stream); T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266}; - raft::allocate(eig_vectors_ref, len); - raft::allocate(eig_vals_ref, 
params.n_col); + raft::allocate(eig_vectors_ref, len, stream); + raft::allocate(eig_vals_ref, params.n_col, stream); raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream); @@ -82,11 +82,11 @@ class EigTest : public ::testing::TestWithParam> { // test code for comparing two methods len = params.n * params.n; - raft::allocate(cov_matrix_large, len); - raft::allocate(eig_vectors_large, len); - raft::allocate(eig_vectors_jacobi_large, len); - raft::allocate(eig_vals_large, params.n); - raft::allocate(eig_vals_jacobi_large, params.n); + raft::allocate(cov_matrix_large, len, stream); + raft::allocate(eig_vectors_large, len, stream); + raft::allocate(eig_vectors_jacobi_large, len, stream); + raft::allocate(eig_vals_large, params.n, stream); + raft::allocate(eig_vals_jacobi_large, params.n, stream); r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream); @@ -97,15 +97,7 @@ class EigTest : public ::testing::TestWithParam> { sweeps); } - void TearDown() override { - CUDA_CHECK(cudaFree(cov_matrix)); - CUDA_CHECK(cudaFree(eig_vectors)); - CUDA_CHECK(cudaFree(eig_vectors_jacobi)); - CUDA_CHECK(cudaFree(eig_vals)); - CUDA_CHECK(cudaFree(eig_vals_jacobi)); - CUDA_CHECK(cudaFree(eig_vectors_ref)); - CUDA_CHECK(cudaFree(eig_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: EigInputs params; diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index b3980f281d..bdd0a08ff6 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -51,36 +51,31 @@ class EigSelTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); int len = params.len; - raft::allocate(cov_matrix, len); + raft::allocate(cov_matrix, len, stream); T cov_matrix_h[] = {1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0}; ASSERT(len == 16, "This test only works with 4x4 matrices!"); raft::update_device(cov_matrix, cov_matrix_h, len, stream); - raft::allocate(eig_vectors, 12); - raft::allocate(eig_vals, params.n_col); + raft::allocate(eig_vectors, 12, stream); + raft::allocate(eig_vals, params.n_col, stream); T eig_vectors_ref_h[] = {-0.5123, 0.4874, 0.4874, -0.5123, 0.6498, 0.2789, -0.2789, -0.6498, 0.4874, 0.5123, 0.5123, 0.4874}; T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266}; - raft::allocate(eig_vectors_ref, 12); - raft::allocate(eig_vals_ref, params.n_col); + raft::allocate(eig_vectors_ref, 12, stream); + raft::allocate(eig_vals_ref, params.n_col, stream); raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream); raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream); eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors, eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(cov_matrix)); - CUDA_CHECK(cudaFree(eig_vectors)); - CUDA_CHECK(cudaFree(eig_vals)); - CUDA_CHECK(cudaFree(eig_vectors_ref)); - CUDA_CHECK(cudaFree(eig_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: EigSelInputs params; diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index 572951c557..e955f7a354 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -69,9 +69,9 @@ class ScalarMultiplyTest cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + 
raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, T(-1.0), T(1.0), stream); naiveScale(out_ref, in, scalar, len, stream); scalarMultiply(out, in, scalar, len, stream); @@ -156,10 +156,10 @@ class EltwiseAddTest : public ::testing::TestWithParam> { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); int len = params.len; - allocate(in1, len); - allocate(in2, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, T(-1.0), T(1.0), stream); r.uniform(in2, len, T(-1.0), T(1.0), stream); naiveAdd(out_ref, in1, in2, len, stream); diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 227bce6a48..5b13fb5362 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include "../test_utils.h" @@ -48,11 +47,10 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2, const InType *in3, InType scalar, IdxType len, cudaStream_t stream) { InType *tmp; - allocate(tmp, len); + raft::allocate(tmp, len, stream); eltwiseAdd(tmp, in1, in2, len, stream); eltwiseAdd(out_ref, tmp, in3, len, stream); scalarAdd(out_ref, out_ref, (OutType)scalar, len, stream); - CUDA_CHECK(cudaFree(tmp)); } template @@ -64,35 +62,32 @@ class MapTest ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); IdxType len = params.len; - allocate(in1, len); - allocate(in2, len); - allocate(in3, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(in3, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, InType(-1.0), InType(1.0), stream); r.uniform(in2, len, InType(-1.0), InType(1.0), stream); r.uniform(in3, len, InType(-1.0), InType(1.0), stream); create_ref(out_ref, in1, in2, in3, params.scalar, len, stream); mapLaunch(out, in1, in2, in3, params.scalar, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(in3)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MapInputs params; InType *in1, *in2, *in3; OutType *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf_i32 = { diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index 6e146fa4bb..4a44e59504 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include "../test_utils.h" namespace raft { @@ -74,26 +76,25 @@ class MapReduceTest : public ::testing::TestWithParam> { raft::random::Rng r(params.seed); auto len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, InType(-1.0), InType(1.0), stream); mapReduceLaunch(out_ref, out, in, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + 
CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MapReduceInputs params; InType *in; OutType *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { @@ -131,9 +132,7 @@ class MapGenericReduceTest : public ::testing::Test { protected: MapGenericReduceTest() - : allocator(handle.get_device_allocator()), - input(allocator, handle.get_stream(), n), - output(allocator, handle.get_stream(), 1) { + : input(n, handle.get_stream()), output(handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); initInput(input.data(), input.size(), stream); @@ -172,9 +171,8 @@ class MapGenericReduceTest : public ::testing::Test { int n = 1237; raft::handle_t handle; cudaStream_t stream; - std::shared_ptr allocator; - raft::mr::device::buffer input; - raft::mr::device::buffer output; + rmm::device_uvector input; + rmm::device_scalar output; }; using IoTypePair = diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index aa46c78b0f..e017ee0918 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -66,14 +66,13 @@ class MatVecOpTest IdxType N = params.rows, D = params.cols; IdxType len = N * D; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); IdxType vecLen = params.bcastAlongRows ? D : N; - allocate(vec1, vecLen); - allocate(vec2, vecLen); + raft::allocate(vec1, vecLen, stream); + raft::allocate(vec2, vecLen, stream); r.uniform(in, len, (T)-1.0, (T)1.0, stream); r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream); r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream); @@ -86,20 +85,18 @@ class MatVecOpTest } matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor, params.bcastAlongRows, params.useTwoVectors, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(vec1)); - CUDA_CHECK(cudaFree(vec2)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(in)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MatVecOpInputs params; T *in, *out, *out_ref, *vec1, *vec2; + cudaStream_t stream; }; const std::vector> inputsf_i32 = { diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index 1d3e753de3..d7bda7c27d 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -31,27 +31,26 @@ class MultiplyTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in, len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, T(-1.0), T(1.0), stream); naiveScale(out_ref, in, params.scalar, len, stream); multiplyScalar(out, in, params.scalar, len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - 
CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: UnaryOpInputs params; T *in, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index acc25addd0..5563064982 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -78,9 +78,9 @@ class RowNormTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols, len = rows * cols; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); - raft::allocate(dots_exp, rows); - raft::allocate(dots_act, rows); + raft::allocate(data, len, stream); + raft::allocate(dots_exp, rows, stream); + raft::allocate(dots_act, rows, stream); r.uniform(data, len, T(-1.0), T(1.0), stream); naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); @@ -143,10 +143,10 @@ class ColNormTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols, len = rows * cols; cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); + raft::allocate(data, len, stream); r.uniform(data, len, T(-1.0), T(1.0), stream); - raft::allocate(dots_exp, cols); - raft::allocate(dots_act, cols); + raft::allocate(dots_exp, cols, stream); + raft::allocate(dots_act, cols, stream); naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream); @@ -157,13 +157,11 @@ class ColNormTest : public ::testing::TestWithParam> { } else { colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream); } - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index 9082397265..7ceeaf7f8e 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -63,9 +63,9 @@ class ReduceTest int rows = params.rows, cols = params.cols; int len = rows * cols; outlen = params.alongRows ? 
rows : cols; - raft::allocate(data, len); - raft::allocate(dots_exp, outlen); - raft::allocate(dots_act, outlen); + raft::allocate(data, len, stream); + raft::allocate(dots_exp, outlen, stream); + raft::allocate(dots_act, outlen, stream); r.uniform(data, len, InType(-1.0), InType(1.0), stream); naiveReduction(dots_exp, data, cols, rows, params.rowMajor, params.alongRows, stream); @@ -82,9 +82,7 @@ class ReduceTest } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh index 30a9c2e271..7f8319636b 100644 --- a/cpp/test/linalg/reduce.cuh +++ b/cpp/test/linalg/reduce.cuh @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include namespace raft { namespace linalg { @@ -54,17 +54,20 @@ void unaryAndGemv(OutType *dots, const InType *data, int D, int N, cudaStream_t stream) { //computes a MLCommon unary op on data (squares it), then computes Ax //(A input matrix and x column vector) to sum columns - thrust::device_vector sq(D * N); + rmm::device_uvector sq(D * N, stream); raft::linalg::unaryOp( thrust::raw_pointer_cast(sq.data()), data, D * N, [] __device__(InType v) { return static_cast(v * v); }, stream); cublasHandle_t handle; CUBLAS_CHECK(cublasCreate(&handle)); - thrust::device_vector ones(N, 1); //column vector [1...1] + rmm::device_uvector ones(N, stream); //column vector [1...1] + raft::linalg::unaryOp( + ones.data(), ones.data(), ones.size(), + [=] __device__(OutType input) { return 1; }, stream); OutType alpha = 1, beta = 0; - CUBLAS_CHECK(raft::linalg::cublasgemv( - handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D, - thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream)); + CUBLAS_CHECK(raft::linalg::cublasgemv(handle, CUBLAS_OP_N, D, N, &alpha, + sq.data(), D, ones.data(), 1, &beta, + dots, 1, stream)); CUDA_CHECK(cudaDeviceSynchronize()); CUBLAS_CHECK(cublasDestroy(handle)); } diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index b27fa2ac1a..55d8cc0e92 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -49,9 +49,9 @@ class stridedReductionTest int rows = params.rows, cols = params.cols; int len = rows * cols; - raft::allocate(data, len); - raft::allocate(dots_exp, cols); //expected dot products (from test) - raft::allocate(dots_act, cols); //actual dot products (from prim) + raft::allocate(data, len, stream); + raft::allocate(dots_exp, cols, stream); //expected dot products (from test) + raft::allocate(dots_act, cols, stream); //actual dot products (from prim) r.uniform(data, len, T(-1.0), T(1.0), stream); //initialize matrix to random @@ -60,9 +60,7 @@ class stridedReductionTest } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(dots_exp)); - CUDA_CHECK(cudaFree(dots_act)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index ced3f65fdd..27dea8503f 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -79,12 +79,11 @@ class SubtractTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in1, len); - raft::allocate(in2, 
len); - raft::allocate(out_ref, len); - raft::allocate(out, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in1, len, T(-1.0), T(1.0), stream); r.uniform(in2, len, T(-1.0), T(1.0), stream); @@ -95,19 +94,18 @@ class SubtractTest : public ::testing::TestWithParam> { subtractScalar(out, out, T(1), len, stream); subtract(in1, in1, in2, len, stream); subtractScalar(in1, in1, T(1), len, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: SubtractInputs params; T *in1, *in2, *out_ref, *out; + cudaStream_t stream; }; const std::vector> inputsf2 = { diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index fff321768f..72a27790de 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -48,8 +48,8 @@ class SvdTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.len; - cudaStream_t stream = handle.get_stream(); - raft::allocate(data, len); + stream = handle.get_stream(); + raft::allocate(data, len, stream); ASSERT(params.n_row == 3, "This test only supports nrows=3!"); ASSERT(params.len == 6, "This test only supports len=6!"); @@ -59,9 +59,9 @@ class SvdTest : public ::testing::TestWithParam> { int left_evl = params.n_row * params.n_col; int right_evl = params.n_col * params.n_col; - raft::allocate(left_eig_vectors_qr, left_evl); - raft::allocate(right_eig_vectors_trans_qr, right_evl); - raft::allocate(sing_vals_qr, params.n_col); + raft::allocate(left_eig_vectors_qr, left_evl, stream); + raft::allocate(right_eig_vectors_trans_qr, right_evl, stream); + raft::allocate(sing_vals_qr, params.n_col, stream); // allocate(left_eig_vectors_jacobi, left_evl); // allocate(right_eig_vectors_trans_jacobi, right_evl); @@ -74,9 +74,9 @@ class SvdTest : public ::testing::TestWithParam> { T sing_vals_ref_h[] = {7.065283, 1.040081}; - raft::allocate(left_eig_vectors_ref, left_evl); - raft::allocate(right_eig_vectors_ref, right_evl); - raft::allocate(sing_vals_ref, params.n_col); + raft::allocate(left_eig_vectors_ref, left_evl, stream); + raft::allocate(right_eig_vectors_ref, right_evl, stream); + raft::allocate(sing_vals_ref, params.n_col, stream); raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, stream); @@ -87,22 +87,16 @@ class SvdTest : public ::testing::TestWithParam> { svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr, left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(left_eig_vectors_qr)); - CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr)); - CUDA_CHECK(cudaFree(sing_vals_qr)); - CUDA_CHECK(cudaFree(left_eig_vectors_ref)); - CUDA_CHECK(cudaFree(right_eig_vectors_ref)); - CUDA_CHECK(cudaFree(sing_vals_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: SvdInputs params; T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref; + cudaStream_t stream; }; const std::vector> inputsf2 = { diff --git 
a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index f10b029962..c574f54a05 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -48,26 +48,22 @@ class TransposeTest : public ::testing::TestWithParam> { int len = params.len; - raft::allocate(data, len); + raft::allocate(data, len, stream); ASSERT(params.len == 9, "This test works only with len=9!"); T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}; raft::update_device(data, data_h, len, stream); - raft::allocate(data_trans_ref, len); + raft::allocate(data_trans_ref, len, stream); T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0}; raft::update_device(data_trans_ref, data_ref_h, len, stream); - raft::allocate(data_trans, len); + raft::allocate(data_trans, len, stream); transpose(handle, data, data_trans, params.n_row, params.n_col, stream); transpose(data, params.n_row, stream); } - void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(data_trans)); - CUDA_CHECK(cudaFree(data_trans_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: TranposeInputs params; diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index 666ab8619d..042e8b9cbf 100644 --- a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -53,18 +53,15 @@ class UnaryOpTest raft::random::Rng r(params.seed); CUDA_CHECK(cudaStreamCreate(&stream)); auto len = params.len; - allocate(in, len); - allocate(out_ref, len); - allocate(out, len); + raft::allocate(in, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(out, len, stream); r.uniform(in, len, InType(-1.0), InType(1.0), stream); } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); } virtual void DoTest() { diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 578139623a..63381dec07 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -115,22 +115,22 @@ class MathTest : public ::testing::TestWithParam> { random::Rng r(params.seed); int len = params.len; - allocate(in_power, len); - allocate(out_power_ref, len); - allocate(in_sqrt, len); - allocate(out_sqrt_ref, len); - allocate(in_sign_flip, len); - allocate(out_sign_flip_ref, len); - raft::handle_t handle; - cudaStream_t stream; + stream = handle.get_stream(); CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in_ratio, 4); + raft::allocate(in_power, len, stream); + raft::allocate(out_power_ref, len, stream); + raft::allocate(in_sqrt, len, stream); + raft::allocate(out_sqrt_ref, len, stream); + raft::allocate(in_sign_flip, len, stream); + raft::allocate(out_sign_flip_ref, len, stream); + + raft::allocate(in_ratio, 4, stream); T in_ratio_h[4] = {1.0, 2.0, 2.0, 3.0}; update_device(in_ratio, in_ratio_h, 4, stream); - allocate(out_ratio_ref, 4); + raft::allocate(out_ratio_ref, 4, stream); T out_ratio_ref_h[4] = {0.125, 0.25, 0.25, 0.375}; update_device(out_ratio_ref, out_ratio_ref_h, 4, stream); @@ -150,9 +150,9 @@ class MathTest : public ::testing::TestWithParam> { naiveSignFlip(in_sign_flip, out_sign_flip_ref, params.n_row, params.n_col); signFlip(in_sign_flip, params.n_row, params.n_col, stream); - allocate(in_recip, 4); - allocate(in_recip_ref, 4); - allocate(out_recip, 4); + raft::allocate(in_recip, 4, stream); + raft::allocate(in_recip_ref, 4, stream); + raft::allocate(out_recip, 4, stream); // default 
threshold is 1e-15 std::vector in_recip_h = {0.1, 0.01, -0.01, 0.1e-16}; std::vector in_recip_ref_h = {10.0, 100.0, -100.0, 0.0}; @@ -167,38 +167,23 @@ class MathTest : public ::testing::TestWithParam> { std::vector in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1}; std::vector in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1}; - allocate(in_smallzero, 4); - allocate(out_smallzero, 4); - allocate(out_smallzero_ref, 4); + raft::allocate(in_smallzero, 4, stream); + raft::allocate(out_smallzero, 4, stream); + raft::allocate(out_smallzero_ref, 4, stream); update_device(in_smallzero, in_small_val_zero_h.data(), 4, stream); update_device(out_smallzero_ref, in_small_val_zero_ref_h.data(), 4, stream); setSmallValuesZero(out_smallzero, in_smallzero, 4, stream); setSmallValuesZero(in_smallzero, 4, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); } - void TearDown() override { - CUDA_CHECK(cudaFree(in_power)); - CUDA_CHECK(cudaFree(out_power_ref)); - CUDA_CHECK(cudaFree(in_sqrt)); - CUDA_CHECK(cudaFree(out_sqrt_ref)); - CUDA_CHECK(cudaFree(in_ratio)); - CUDA_CHECK(cudaFree(out_ratio_ref)); - CUDA_CHECK(cudaFree(in_sign_flip)); - CUDA_CHECK(cudaFree(out_sign_flip_ref)); - CUDA_CHECK(cudaFree(in_recip)); - CUDA_CHECK(cudaFree(in_recip_ref)); - CUDA_CHECK(cudaFree(out_recip)); - CUDA_CHECK(cudaFree(in_smallzero)); - CUDA_CHECK(cudaFree(out_smallzero)); - CUDA_CHECK(cudaFree(out_smallzero_ref)); - } + void TearDown() override { raft::deallocate_all(stream); } protected: MathInputs params; T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref, *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index 28222c0697..cc88df0a73 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include "../test_utils.h" namespace raft { @@ -43,11 +44,10 @@ class MatrixTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); int len = params.n_row * params.n_col; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(in1, len); - raft::allocate(in2, len); - raft::allocate(in1_revr, len); + raft::allocate(in1, len, stream); + raft::allocate(in2, len, stream); + raft::allocate(in1_revr, len, stream); r.uniform(in1, len, T(-1.0), T(1.0), stream); copy(in1, in2, params.n_row, params.n_col, stream); @@ -55,20 +55,20 @@ class MatrixTest : public ::testing::TestWithParam> { // colReverse(in1_revr, params.n_row, params.n_col); T *outTrunc; - raft::allocate(outTrunc, 6); + raft::allocate(outTrunc, 6, stream); truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(in1)); - CUDA_CHECK(cudaFree(in2)); - // CUDA_CHECK(cudaFree(in1_revr)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MatrixInputs params; T *in1, *in2, *in1_revr; + cudaStream_t stream; }; const std::vector> inputsf2 = {{0.000001f, 4, 4, 1234ULL}}; @@ -102,17 +102,16 @@ class MatrixCopyRowsTest : public ::testing::Test { protected: MatrixCopyRowsTest() - : allocator(handle.get_device_allocator()), - input(allocator, handle.get_stream(), n_cols * n_rows), - indices(allocator, handle.get_stream(), n_selected), - 
output(allocator, handle.get_stream(), n_cols * n_selected) { + : input(n_cols * n_rows, handle.get_stream()), + indices(n_selected, handle.get_stream()), + output(n_cols * n_selected, handle.get_stream()) { CUDA_CHECK(cudaStreamCreate(&stream)); handle.set_stream(stream); raft::update_device(indices.data(), indices_host, n_selected, stream); // Init input array thrust::counting_iterator first(0); thrust::device_ptr ptr(input.data()); - thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, + thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, ptr); } @@ -143,10 +142,9 @@ class MatrixCopyRowsTest : public ::testing::Test { 14, 21, 22, 23, 27, 28, 29}; raft::handle_t handle; cudaStream_t stream; - std::shared_ptr allocator; - raft::mr::device::buffer input; - raft::mr::device::buffer output; - raft::mr::device::buffer indices; + rmm::device_uvector input; + rmm::device_uvector output; + rmm::device_uvector indices; }; using TypeTuple = diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp index 223efdbfe8..fe42cea8b3 100644 --- a/cpp/test/mr/device/buffer.cpp +++ b/cpp/test/mr/device/buffer.cpp @@ -15,22 +15,21 @@ */ #include +#include #include #include -#include +#include #include -#include namespace raft { namespace mr { namespace device { TEST(Raft, DeviceBufferAlloc) { - auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction - buffer buff(alloc, stream); + rmm::device_uvector buff(0, stream); ASSERT_EQ(0, buff.size()); // explicit allocation after construction buff.resize(20, stream); @@ -39,12 +38,12 @@ TEST(Raft, DeviceBufferAlloc) { buff.resize(10, stream); ASSERT_EQ(10, buff.size()); // explicit deallocation - buff.release(stream); + buff.release(); ASSERT_EQ(0, buff.size()); // use these methods without the explicit stream parameter - buff.resize(20); + buff.resize(20, stream); ASSERT_EQ(20, buff.size()); - buff.resize(10); + buff.resize(10, stream); ASSERT_EQ(10, buff.size()); buff.release(); ASSERT_EQ(0, buff.size()); @@ -62,11 +61,10 @@ TEST(Raft, DeviceBufferZeroResize) { rmm::mr::set_current_device_resource(limit_mr.get()); - auto alloc = std::make_shared(); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); // no allocation at construction - buffer buff(alloc, stream, 10); + rmm::device_uvector buff(10, stream); ASSERT_EQ(10, buff.size()); // explicit allocation after construction buff.resize(0, stream); @@ -75,7 +73,7 @@ TEST(Raft, DeviceBufferZeroResize) { buff.resize(20, stream); ASSERT_EQ(20, buff.size()); // explicit deallocation - buff.release(stream); + buff.release(); ASSERT_EQ(0, buff.size()); // Now check that there is no memory left. (Used to not be true) diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu index 94f81cddb8..781e6d1d3f 100644 --- a/cpp/test/mst.cu +++ b/cpp/test/mst.cu @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +#include + +#include +#include +#include +#include #include "test_utils.h" #include @@ -128,11 +134,18 @@ class MSTTest v = static_cast((csr_d.offsets.size() / sizeof(vertex_t)) - 1); e = static_cast(csr_d.indices.size() / sizeof(edge_t)); - rmm::device_vector mst_src(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector mst_dst(2 * v - 2, - std::numeric_limits::max()); - rmm::device_vector color(v, 0); + rmm::device_uvector mst_src(2 * v - 2, handle.get_stream()); + rmm::device_uvector mst_dst(2 * v - 2, handle.get_stream()); + rmm::device_uvector color(v, handle.get_stream()); + + CUDA_CHECK( + cudaMemsetAsync(mst_src.data(), std::numeric_limits::max(), + mst_src.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK( + cudaMemsetAsync(mst_dst.data(), std::numeric_limits::max(), + mst_dst.size() * sizeof(vertex_t), handle.get_stream())); + CUDA_CHECK(cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), + handle.get_stream())); vertex_t *color_ptr = thrust::raw_pointer_cast(color.data()); @@ -215,7 +228,6 @@ class MSTTest protected: MSTTestInput mst_input; CSRDevice csr_d; - rmm::device_vector mst_edge; vertex_t v; edge_t e; int iterations; diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index af10dcab30..c2ec7a340f 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -85,11 +85,10 @@ class RngTest : public ::testing::TestWithParam> { // 4 x sigma indicates the test shouldn't fail 99.9% of the time. num_sigma = 10; params = ::testing::TestWithParam>::GetParam(); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); - allocate(data, params.len); - allocate(stats, 2, true); + raft::allocate(data, params.len, stream); + raft::allocate(stats, 2, stream, true); switch (params.type) { case RNG_Normal: r.normal(data, params.len, params.start, params.end, stream); @@ -124,12 +123,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(T meanvar[2]) { @@ -182,6 +181,7 @@ class RngTest : public ::testing::TestWithParam> { T *data, *stats; T h_stats[2]; // mean, var int num_sigma; + cudaStream_t stream; }; // The measured mean and standard deviation for each tested distribution are, @@ -383,9 +383,9 @@ TEST(Rng, MeanError) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, len); - allocate(mean_result, num_experiments); - allocate(std_result, num_experiments); + raft::allocate(data, len, stream); + raft::allocate(mean_result, num_experiments, stream); + raft::allocate(std_result, num_experiments, stream); for (auto rtype : {GenPhilox, GenKiss99 /*, raft::random::GenTaps */}) { Rng r(seed, rtype); @@ -416,10 +416,8 @@ TEST(Rng, MeanError) { ASSERT_TRUE( (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5)); } + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_result)); - CUDA_CHECK(cudaFree(std_result)); // std::cout << "mean_res:" << h_mean_result << "\n"; } @@ -432,7 +430,7 @@ class ScaledBernoulliTest : public ::testing::Test { Rng r(42); - allocate(data, len * 
sizeof(T), stream); + raft::allocate(data, len * sizeof(T), stream); r.scaled_bernoulli(data, len, T(0.5), T(scale), stream); } @@ -463,7 +461,7 @@ class BernoulliTest : public ::testing::Test { void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(42); - allocate(data, len * sizeof(bool), stream); + raft::allocate(data, len * sizeof(bool), stream); r.bernoulli(data, len, T(0.5), stream); } @@ -515,12 +513,11 @@ class RngNormalTableTest params = ::testing::TestWithParam>::GetParam(); int len = params.rows * params.cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); - allocate(data, len); - allocate(stats, 2, true); - allocate(mu_vec, params.cols); + raft::allocate(data, len, stream); + raft::allocate(stats, 2, stream, true); + raft::allocate(mu_vec, params.cols, stream); r.fill(mu_vec, params.cols, params.mu, stream); T* sigma_vec = nullptr; r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, @@ -532,13 +529,12 @@ class RngNormalTableTest CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= len; h_stats[1] = (h_stats[1] / len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); - CUDA_CHECK(cudaFree(mu_vec)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(T meanvar[2]) { @@ -551,6 +547,7 @@ class RngNormalTableTest T *data, *stats, *mu_vec; T h_stats[2]; // mean, var int num_sigma; + cudaStream_t stream; }; typedef RngNormalTableTest RngNormalTableTestF; diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index 92f12206e8..a98619e5b4 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -70,10 +70,9 @@ class RngTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); Rng r(params.seed, params.gtype); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, params.len); - allocate(stats, 2, true); + raft::allocate(data, params.len, stream); + raft::allocate(stats, 2, stream, true); switch (params.type) { case RNG_Uniform: r.uniformInt(data, params.len, params.start, params.end, stream); @@ -87,12 +86,12 @@ class RngTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(stream)); h_stats[0] /= params.len; h_stats[1] = (h_stats[1] / params.len) - (h_stats[0] * h_stats[0]); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(stats)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void getExpectedMeanVar(float meanvar[2]) { @@ -110,6 +109,7 @@ class RngTest : public ::testing::TestWithParam> { T *data; float *stats; float h_stats[2]; // mean, var + cudaStream_t stream; }; typedef RngTest RngTestU32; diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index d7e52a8958..cf60f46afe 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -50,10 +50,10 @@ class SWoRTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamCreate(&stream)); Rng r(params.seed, params.gtype); - allocate(in, params.len); - allocate(wts, params.len); - allocate(out, params.sampledLen); - allocate(outIdx, params.sampledLen); + raft::allocate(in, params.len, stream); + 
raft::allocate(wts, params.len, stream); + raft::allocate(out, params.sampledLen, stream); + raft::allocate(outIdx, params.sampledLen, stream); h_outIdx.resize(params.sampledLen); r.uniform(in, params.len, T(-1.0), T(1.0), stream); r.uniform(wts, params.len, T(1.0), T(2.0), stream); @@ -67,12 +67,8 @@ class SWoRTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(wts)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(outIdx)); } protected: diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index 713708d4cd..8429a46941 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -56,27 +56,24 @@ class CSRAddTest cudaStreamCreate(&stream); - raft::allocate(ind_a, n_rows); - raft::allocate(ind_ptr_a, nnz_a); - raft::allocate(values_a, nnz_a); + raft::allocate(ind_a, n_rows, stream); + raft::allocate(ind_ptr_a, nnz_a, stream); + raft::allocate(values_a, nnz_a, stream); - raft::allocate(ind_b, n_rows); - raft::allocate(ind_ptr_b, nnz_b); - raft::allocate(values_b, nnz_b); + raft::allocate(ind_b, n_rows, stream); + raft::allocate(ind_ptr_b, nnz_b, stream); + raft::allocate(values_b, nnz_b, stream); - raft::allocate(ind_verify, n_rows); - raft::allocate(ind_ptr_verify, nnz_result); - raft::allocate(values_verify, nnz_result); + raft::allocate(ind_verify, n_rows, stream); + raft::allocate(ind_ptr_verify, nnz_result, stream); + raft::allocate(values_verify, nnz_result, stream); - raft::allocate(ind_result, n_rows); - raft::allocate(ind_ptr_result, nnz_result); - raft::allocate(values_result, nnz_result); + raft::allocate(ind_result, n_rows, stream); + raft::allocate(ind_ptr_result, nnz_result, stream); + raft::allocate(values_result, nnz_result, stream); } void Run() { - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream); raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, stream); @@ -96,7 +93,7 @@ class CSRAddTest Index_ nnz = linalg::csr_add_calc_inds( ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b, - n_rows, ind_result, alloc, stream); + n_rows, ind_result, stream); ASSERT_TRUE(nnz == nnz_result); ASSERT_TRUE(raft::devArrMatch(ind_verify, ind_result, n_rows, @@ -113,18 +110,8 @@ class CSRAddTest } void TearDown() override { - CUDA_CHECK(cudaFree(ind_a)); - CUDA_CHECK(cudaFree(ind_b)); - CUDA_CHECK(cudaFree(ind_result)); - CUDA_CHECK(cudaFree(ind_ptr_a)); - CUDA_CHECK(cudaFree(ind_ptr_b)); - CUDA_CHECK(cudaFree(ind_ptr_verify)); - CUDA_CHECK(cudaFree(ind_ptr_result)); - CUDA_CHECK(cudaFree(values_a)); - CUDA_CHECK(cudaFree(values_b)); - CUDA_CHECK(cudaFree(values_verify)); - CUDA_CHECK(cudaFree(values_result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu index d98f9de9c3..dd6ba1479e 100644 --- a/cpp/test/sparse/connect_components.cu +++ b/cpp/test/sparse/connect_components.cu @@ -28,7 +28,6 @@ #include #include -#include #include #include #include @@ -57,14 +56,12 @@ class ConnectComponentsTest : public ::testing::TestWithParam< void basicTest() { raft::handle_t handle; - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); params = ::testing::TestWithParam< 
ConnectComponentsInputs>::GetParam(); - raft::sparse::COO out_edges( - handle.get_device_allocator(), handle.get_stream()); + raft::sparse::COO out_edges(handle.get_stream()); rmm::device_uvector data(params.n_row * params.n_col, handle.get_stream()); @@ -77,7 +74,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< /** * 1. Construct knn graph */ - raft::sparse::COO knn_graph_coo(d_alloc, stream); + raft::sparse::COO knn_graph_coo(stream); raft::sparse::selection::knn_graph( handle, data.data(), params.n_row, params.n_col, @@ -85,7 +82,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), - params.n_row + 1, d_alloc, stream); + params.n_row + 1, stream); /** * 2. Construct MST, sorted by weights @@ -112,7 +109,7 @@ class ConnectComponentsTest : public ::testing::TestWithParam< raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, - d_alloc, stream); + stream); auto output_mst = raft::mst::mst( handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row, diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index ea69ecfc53..4f9c00c7ab 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -43,9 +43,9 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); cudaStreamCreate(&stream); - raft::allocate(ex_scan, params.ex_scan.size()); - raft::allocate(verify, params.verify.size()); - raft::allocate(result, params.verify.size(), true); + raft::allocate(ex_scan, params.ex_scan.size(), stream); + raft::allocate(verify, params.verify.size(), stream); + raft::allocate(result, params.verify.size(), stream, true); } void Run() { @@ -62,9 +62,7 @@ class CSRtoCOOTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 553ef2ddee..465aad4e7f 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -19,7 +19,6 @@ #include #include "../test_utils.h" -#include #include #include @@ -61,8 +60,6 @@ typedef SparseConvertCSRTest SortedCOOToCSR; TEST_P(SortedCOOToCSR, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); int nnz = 8; @@ -71,14 +68,14 @@ TEST_P(SortedCOOToCSR, Result) { int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; int *exp_h = new int[4]{0, 2, 4, 6}; - raft::allocate(in, nnz, true); - raft::allocate(exp, 4, true); - raft::allocate(out, 4, true); + raft::allocate(in, nnz, stream, true); + raft::allocate(exp, 4, stream, true); + raft::allocate(out, 4, stream, true); raft::update_device(in, in_h, nnz, stream); raft::update_device(exp, exp_h, 4, stream); - convert::sorted_coo_to_csr(in, nnz, out, 4, alloc, stream); + convert::sorted_coo_to_csr(in, nnz, out, 4, stream); ASSERT_TRUE(raft::devArrMatch(out, exp, 4, raft::Compare())); @@ -115,10 +112,10 @@ class CSRAdjGraphTest cudaStreamCreate(&stream); nnz = params.verify.size(); - raft::allocate(row_ind, params.n_rows); - raft::allocate(adj, params.n_rows * params.n_cols); - raft::allocate(result, nnz, true); - raft::allocate(verify, nnz); + raft::allocate(row_ind, params.n_rows, stream); + 
raft::allocate(adj, params.n_rows * params.n_cols, stream); + raft::allocate(result, nnz, stream, true); + raft::allocate(verify, nnz, stream); } void Run() { @@ -135,11 +132,8 @@ class CSRAdjGraphTest } void TearDown() override { - CUDA_CHECK(cudaFree(row_ind)); - CUDA_CHECK(cudaFree(adj)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu index 625772a842..00e6899cb2 100644 --- a/cpp/test/sparse/csr_row_slice.cu +++ b/cpp/test/sparse/csr_row_slice.cu @@ -19,8 +19,6 @@ #include #include -#include -#include #include @@ -61,9 +59,9 @@ class CSRRowSliceTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -73,9 +71,9 @@ class CSRRowSliceTest std::vector out_indices_ref_h = params.out_indices_ref_h; std::vector out_data_ref_h = params.out_data_ref_h; - allocate(out_indptr_ref, out_indptr_ref_h.size()); - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_data_ref, out_data_ref_h.size()); + raft::allocate(out_indptr_ref, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream); + raft::allocate(out_data_ref, out_data_ref_h.size(), stream); update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); @@ -84,16 +82,14 @@ class CSRRowSliceTest update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); - allocate(out_indptr, out_indptr_ref_h.size()); - allocate(out_indices, out_indices_ref_h.size()); - allocate(out_data, out_data_ref_h.size()); + raft::allocate(out_indptr, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices, out_indices_ref_h.size(), stream); + raft::allocate(out_data, out_data_ref_h.size(), stream); } void SetUp() override { params = ::testing::TestWithParam< CSRRowSliceInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); make_data(); @@ -113,16 +109,8 @@ class CSRRowSliceTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indptr)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_data)); - CUDA_CHECK(cudaFree(out_indptr_ref)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_data_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu index 5535df4fe3..7f6b7dad07 100644 --- a/cpp/test/sparse/csr_to_dense.cu +++ b/cpp/test/sparse/csr_to_dense.cu @@ -16,8 +16,6 @@ #include #include -#include -#include #include #include @@ -57,9 +55,9 @@ class CSRToDenseTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, 
indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -67,18 +65,16 @@ class CSRToDenseTest std::vector out_ref_h = params.out_ref_h; - allocate(out_ref, out_ref_h.size()); + raft::allocate(out_ref, out_ref_h.size(), stream); update_device(out_ref, out_ref_h.data(), out_ref_h.size(), stream); - allocate(out, out_ref_h.size()); + raft::allocate(out, out_ref_h.size(), stream); } void SetUp() override { params = ::testing::TestWithParam< CSRToDenseInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); @@ -92,12 +88,8 @@ class CSRToDenseTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu index c257d6eb3c..e50a9d94a9 100644 --- a/cpp/test/sparse/csr_transpose.cu +++ b/cpp/test/sparse/csr_transpose.cu @@ -20,8 +20,6 @@ #include #include -#include -#include #include @@ -63,9 +61,9 @@ class CSRTransposeTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -75,9 +73,9 @@ class CSRTransposeTest std::vector out_indices_ref_h = params.out_indices_ref_h; std::vector out_data_ref_h = params.out_data_ref_h; - allocate(out_indptr_ref, out_indptr_ref_h.size()); - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_data_ref, out_data_ref_h.size()); + raft::allocate(out_indptr_ref, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream); + raft::allocate(out_data_ref, out_data_ref_h.size(), stream); update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream); @@ -86,16 +84,14 @@ class CSRTransposeTest update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream); - allocate(out_indptr, out_indptr_ref_h.size()); - allocate(out_indices, out_indices_ref_h.size()); - allocate(out_data, out_data_ref_h.size()); + raft::allocate(out_indptr, out_indptr_ref_h.size(), stream); + raft::allocate(out_indices, out_indices_ref_h.size(), stream); + raft::allocate(out_data, out_data_ref_h.size(), stream); } void SetUp() override { params = ::testing::TestWithParam< CSRTransposeInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); CUSPARSE_CHECK(cusparseCreate(&handle)); @@ -103,23 +99,15 @@ class CSRTransposeTest raft::sparse::linalg::csr_transpose( handle, indptr, indices, data, out_indptr, out_indices, out_data, - params.nrows, params.ncols, params.nnz, alloc, stream); + params.nrows, params.ncols, params.nnz, stream); 
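Across these test diffs the same conversion repeats: device buffers are allocated against the test's CUDA stream and released in a single call before the stream is destroyed. A minimal sketch of that fixture pattern, assuming the `raft::allocate`/`raft::deallocate_all` test utilities these patches switch to, with the header location and the `ExampleTest` fixture name used purely for illustration:

```cpp
#include <raft/cudart_utils.h>  // assumed location of raft::allocate / raft::deallocate_all / CUDA_CHECK

#include <gtest/gtest.h>

class ExampleTest : public ::testing::Test {
 protected:
  void SetUp() override {
    CUDA_CHECK(cudaStreamCreate(&stream));
    raft::allocate(data, len, stream);          // stream-ordered, uninitialized
    raft::allocate(result, len, stream, true);  // trailing flag zero-initializes
  }

  void TearDown() override {
    raft::deallocate_all(stream);  // frees every allocation made on this stream
    CUDA_CHECK(cudaStreamDestroy(stream));
  }

  static constexpr int len = 1024;
  float* data;
  float* result;
  cudaStream_t stream;
};
```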
CUDA_CHECK(cudaStreamSynchronize(stream)); CUSPARSE_CHECK(cusparseDestroy(handle)); } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indptr)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_data)); - CUDA_CHECK(cudaFree(out_indptr_ref)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_data_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } void compare() { diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index 5d687ad92b..f8a469af45 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -48,25 +48,27 @@ const std::vector> inputsf = {{5, 10, 5, 1234ULL}}; typedef SparseDegreeTests COODegree; TEST_P(COODegree, Result) { + cudaStream_t stream; + cudaStreamCreate(&stream); int *in_rows, *verify, *results; int in_rows_h[5] = {0, 0, 1, 2, 2}; int verify_h[5] = {2, 1, 2, 0, 0}; - raft::allocate(in_rows, 5); - raft::allocate(verify, 5, true); - raft::allocate(results, 5, true); + raft::allocate(in_rows, 5, stream); + raft::allocate(verify, 5, stream, true); + raft::allocate(results, 5, stream, true); - raft::update_device(in_rows, *&in_rows_h, 5, 0); - raft::update_device(verify, *&verify_h, 5, 0); + raft::update_device(in_rows, *&in_rows_h, 5, stream); + raft::update_device(verify, *&verify_h, 5, stream); - linalg::coo_degree<32>(in_rows, 5, results, 0); + linalg::coo_degree<32>(in_rows, 5, results, stream); cudaDeviceSynchronize(); ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } typedef SparseDegreeTests COODegreeNonzero; @@ -81,23 +83,21 @@ TEST_P(COODegreeNonzero, Result) { float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0}; int verify_h[5] = {1, 0, 2, 0, 0}; - raft::allocate(in_rows, 5); - raft::allocate(verify, 5, true); - raft::allocate(results, 5, true); - raft::allocate(in_vals, 5, true); + raft::allocate(in_rows, 5, stream); + raft::allocate(verify, 5, stream, true); + raft::allocate(results, 5, stream, true); + raft::allocate(in_vals, 5, stream, true); - raft::update_device(in_rows, *&in_rows_h, 5, 0); - raft::update_device(verify, *&verify_h, 5, 0); - raft::update_device(in_vals, *&in_vals_h, 5, 0); + raft::update_device(in_rows, *&in_rows_h, 5, stream); + raft::update_device(verify, *&verify_h, 5, stream); + raft::update_device(in_vals, *&in_vals_h, 5, stream); linalg::coo_degree_nz<32, float>(in_rows, in_vals, 5, results, stream); cudaDeviceSynchronize(); ASSERT_TRUE(raft::devArrMatch(verify, results, 5, raft::Compare())); - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); - + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu index a83b93f83f..563dcf6f15 100644 --- a/cpp/test/sparse/dist_coo_spmv.cu +++ b/cpp/test/sparse/dist_coo_spmv.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include @@ -94,10 +94,9 @@ class SparseDistanceCOOSPMVTest template void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true) { - raft::mr::device::buffer coo_rows( - dist_config.handle.get_device_allocator(), - dist_config.handle.get_stream(), - max(dist_config.b_nnz, dist_config.a_nnz)); + rmm::device_uvector coo_rows( + 
max(dist_config.b_nnz, dist_config.a_nnz), + dist_config.handle.get_stream()); raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows, coo_rows.data(), dist_config.b_nnz, @@ -161,9 +160,9 @@ class SparseDistanceCOOSPMVTest std::vector indices_h = params.input_configuration.indices_h; std::vector data_h = params.input_configuration.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), handle.get_stream()); + raft::allocate(indices, indices_h.size(), handle.get_stream()); + raft::allocate(data, data_h.size(), handle.get_stream()); update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); @@ -174,7 +173,8 @@ class SparseDistanceCOOSPMVTest std::vector out_dists_ref_h = params.input_configuration.out_dists_ref_h; - allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); + raft::allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1), + handle.get_stream()); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); @@ -201,21 +201,14 @@ class SparseDistanceCOOSPMVTest int out_size = dist_config.a_nrows * dist_config.b_nrows; - allocate(out_dists, out_size); + raft::allocate(out_dists, out_size, handle.get_stream()); run_spmv(); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu index 0589637061..4b531992f0 100644 --- a/cpp/test/sparse/distance.cu +++ b/cpp/test/sparse/distance.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include @@ -82,21 +81,14 @@ class SparseDistanceTest int out_size = dist_config.a_nrows * dist_config.b_nrows; - allocate(out_dists, out_size); + raft::allocate(out_dists, out_size, handle.get_stream()); pairwiseDistance(out_dists, dist_config, params.metric, params.metric_arg); CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, @@ -110,9 +102,9 @@ class SparseDistanceTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), handle.get_stream()); + raft::allocate(indices, indices_h.size(), handle.get_stream()); + raft::allocate(data, data_h.size(), handle.get_stream()); update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); @@ -122,7 +114,8 @@ class SparseDistanceTest std::vector out_dists_ref_h = params.out_dists_ref_h; - allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1)); + raft::allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1), + 
handle.get_stream()); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), dist_config.handle.get_stream()); diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index f7954f899f..4634e5fc0e 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -20,7 +20,6 @@ #include "../test_utils.h" #include -#include #include #include @@ -53,13 +52,11 @@ typedef SparseFilterTests COORemoveZeros; TEST_P(COORemoveZeros, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); params = ::testing::TestWithParam>::GetParam(); float *in_h_vals = new float[params.nnz]; - COO in(alloc, stream, params.nnz, 5, 5); + COO in(stream, params.nnz, 5, 5); raft::random::Rng r(params.seed); r.uniform(in.vals(), params.nnz, float(-1.0), float(1.0), stream); @@ -82,7 +79,7 @@ TEST_P(COORemoveZeros, Result) { raft::update_device(in.cols(), in_h_cols, params.nnz, stream); raft::update_device(in.vals(), in_h_vals, params.nnz, stream); - op::coo_sort(&in, alloc, stream); + op::coo_sort(&in, stream); int out_rows_ref_h[2] = {0, 3}; int out_cols_ref_h[2] = {4, 1}; @@ -91,14 +88,14 @@ TEST_P(COORemoveZeros, Result) { out_vals_ref_h[0] = in_h_vals[4]; out_vals_ref_h[1] = in_h_vals[1]; - COO out_ref(alloc, stream, 2, 5, 5); - COO out(alloc, stream); + COO out_ref(stream, 2, 5, 5); + COO out(stream); raft::update_device(out_ref.rows(), *&out_rows_ref_h, 2, stream); raft::update_device(out_ref.cols(), *&out_cols_ref_h, 2, stream); raft::update_device(out_ref.vals(), out_vals_ref_h, 2, stream); - op::coo_remove_zeros<32, float>(&in, &out, alloc, stream); + op::coo_remove_zeros<32, float>(&in, &out, stream); ASSERT_TRUE(raft::devArrMatch(out_ref.rows(), out.rows(), 2, raft::Compare())); diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu index 8c3bf36318..22f97559b1 100644 --- a/cpp/test/sparse/knn.cu +++ b/cpp/test/sparse/knn.cu @@ -24,8 +24,6 @@ #include #include -#include -#include namespace raft { namespace sparse { @@ -82,15 +80,7 @@ class SparseKNNTest CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); } - void TearDown() override { - CUDA_CHECK(cudaFree(indptr)); - CUDA_CHECK(cudaFree(indices)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_dists_ref)); - } + void TearDown() override { raft::deallocate_all(handle.get_stream()); } void compare() { ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, @@ -105,9 +95,9 @@ class SparseKNNTest std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), handle.get_stream()); + raft::allocate(indices, indices_h.size(), handle.get_stream()); + raft::allocate(data, data_h.size(), handle.get_stream()); update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream()); @@ -118,16 +108,17 @@ class SparseKNNTest std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_dists_ref, out_dists_ref_h.size()); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), + handle.get_stream()); + raft::allocate(out_dists_ref, out_dists_ref_h.size(), handle.get_stream()); update_device(out_indices_ref, out_indices_ref_h.data(), 
out_indices_ref_h.size(), handle.get_stream()); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream()); - allocate(out_dists, n_rows * k); - allocate(out_indices, n_rows * k); + raft::allocate(out_dists, n_rows * k, handle.get_stream()); + raft::allocate(out_indices, n_rows * k, handle.get_stream()); } raft::handle_t handle; diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index ec41b32374..e259eafa70 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "../test_utils.h" @@ -64,12 +65,11 @@ class KNNGraphTest raft::handle_t handle; - auto alloc = handle.get_device_allocator(); stream = handle.get_stream(); - out = new raft::sparse::COO(alloc, stream); + out = new raft::sparse::COO(stream); - allocate(X, params.X.size()); + raft::allocate(X, params.X.size(), stream); update_device(X, params.X.data(), params.X.size(), stream); @@ -77,9 +77,8 @@ class KNNGraphTest handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out); - rmm::device_uvector sum(1, stream); - - CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); + rmm::device_scalar sum(stream); + sum.set_value_to_zero_async(stream); /** * Assert the knn graph is symmetric @@ -87,12 +86,13 @@ class KNNGraphTest assert_symmetry<<nnz, 256), 256, 0, stream>>>( out->rows(), out->cols(), out->vals(), out->nnz, sum.data()); - raft::update_host(&sum_h, sum.data(), 1, stream); + sum_h = sum.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(X)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); delete out; } diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu index a157a17e30..3bd144ef54 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/sparse/linkage.cu @@ -19,8 +19,6 @@ #include #include #include -#include -#include #include #include @@ -108,18 +106,16 @@ __global__ void computeTheNumerator(const T* firstClusterArray, * @param firstClusterArray: the array of classes of type T * @param secondClusterArray: the array of classes of type T * @param size: the size of the data points of type uint64_t -* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr * @param stream: the cudaStream object */ template -double compute_rand_index( - T* firstClusterArray, T* secondClusterArray, uint64_t size, - std::shared_ptr allocator, cudaStream_t stream) { +double compute_rand_index(T* firstClusterArray, T* secondClusterArray, + uint64_t size, cudaStream_t stream) { //rand index for size less than 2 is not defined ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); //allocating and initializing memory for a and b in the GPU - raft::mr::device::buffer arr_buf(allocator, stream, 2); + rmm::device_uvector arr_buf(2, stream); CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); //kernel configuration @@ -159,30 +155,27 @@ template class LinkageTest : public ::testing::TestWithParam> { protected: void basicTest() { - raft::handle_t handle; + CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); - rmm::device_uvector data(params.n_row * params.n_col, - handle.get_stream()); + rmm::device_uvector data(params.n_row * params.n_col, stream); // Allocate result labels and expected labels on device - raft::allocate(labels, params.n_row); - 
raft::allocate(labels_ref, params.n_row); + raft::allocate(labels, params.n_row, stream); + raft::allocate(labels_ref, params.n_row, stream); - raft::copy(data.data(), params.data.data(), data.size(), - handle.get_stream()); - raft::copy(labels_ref, params.expected_labels.data(), params.n_row, - handle.get_stream()); + raft::copy(data.data(), params.data.data(), data.size(), stream); + raft::copy(labels_ref, params.expected_labels.data(), params.n_row, stream); raft::hierarchy::linkage_output out_arrs; out_arrs.labels = labels; - rmm::device_uvector out_children(params.n_row * 2, - handle.get_stream()); + rmm::device_uvector out_children(params.n_row * 2, stream); out_arrs.children = out_children.data(); + raft::handle_t handle; raft::hierarchy::single_linkage< IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>( handle, data.data(), params.n_row, params.n_col, @@ -191,23 +184,21 @@ class LinkageTest : public ::testing::TestWithParam> { CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); - score = - compute_rand_index(labels, labels_ref, params.n_row, - handle.get_device_allocator(), handle.get_stream()); + score = compute_rand_index(labels, labels_ref, params.n_row, stream); } void SetUp() override { basicTest(); } void TearDown() override { - CUDA_CHECK(cudaFree(labels)); - CUDA_CHECK(cudaFree(labels_ref)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: LinkageInputs params; IdxT *labels, *labels_ref; - double score; + cudaStream_t stream; }; const std::vector> linkage_inputsf2 = { diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 7adbbf8b9a..d69dd15c57 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -47,10 +47,10 @@ class CSRRowNormalizeTest CSRRowNormalizeInputs>::GetParam(); cudaStreamCreate(&stream); - raft::allocate(in_vals, params.in_vals.size()); - raft::allocate(verify, params.verify.size()); - raft::allocate(ex_scan, params.ex_scan.size()); - raft::allocate(result, params.verify.size(), true); + raft::allocate(in_vals, params.in_vals.size(), stream); + raft::allocate(verify, params.verify.size(), stream); + raft::allocate(ex_scan, params.ex_scan.size(), stream); + raft::allocate(result, params.verify.size(), stream, true); } void Run() { @@ -77,11 +77,8 @@ class CSRRowNormalizeTest } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(in_vals)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu index 50b5dc5993..8ff4a600bc 100644 --- a/cpp/test/sparse/reduce.cu +++ b/cpp/test/sparse/reduce.cu @@ -53,7 +53,6 @@ class SparseReduceTest void Run() { raft::handle_t handle; - auto d_alloc = handle.get_device_allocator(); auto stream = handle.get_stream(); rmm::device_uvector in_rows(params.in_rows.size(), stream); @@ -76,7 +75,7 @@ class SparseReduceTest raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream); - raft::sparse::COO out(d_alloc, stream); + raft::sparse::COO out(stream); raft::sparse::op::max_duplicates(handle, out, in_rows.data(), in_cols.data(), in_vals.data(), params.in_rows.size(), params.m, params.n); diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index b64fa25883..805a3d85da 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -59,9 +59,9 @@ class CSRRowOpTest n_rows = 
params.ex_scan.size(); nnz = params.verify.size(); - raft::allocate(verify, nnz); - raft::allocate(ex_scan, n_rows); - raft::allocate(result, nnz, true); + raft::allocate(verify, nnz, stream); + raft::allocate(ex_scan, n_rows, stream); + raft::allocate(result, nnz, stream, true); } void Run() { @@ -75,10 +75,8 @@ class CSRRowOpTest } void TearDown() override { - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - cudaStreamDestroy(stream); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu index 46f2f6a844..256ecfdfb7 100644 --- a/cpp/test/sparse/selection.cu +++ b/cpp/test/sparse/selection.cu @@ -57,32 +57,30 @@ class SparseSelectionTest void make_data() { std::vector dists_h = params.dists_h; - allocate(dists, n_rows * n_cols); + raft::allocate(dists, n_rows * n_cols, stream); update_device(dists, dists_h.data(), dists_h.size(), stream); - allocate(inds, n_rows * n_cols); + raft::allocate(inds, n_rows * n_cols, stream); iota_fill(inds, n_rows, n_cols, stream); std::vector out_dists_ref_h = params.out_dists_ref_h; std::vector out_indices_ref_h = params.out_indices_ref_h; - allocate(out_indices_ref, out_indices_ref_h.size()); - allocate(out_dists_ref, out_dists_ref_h.size()); + raft::allocate(out_indices_ref, out_indices_ref_h.size(), stream); + raft::allocate(out_dists_ref, out_dists_ref_h.size(), stream); update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream); update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), stream); - allocate(out_dists, n_rows * k); - allocate(out_indices, n_rows * k); + raft::allocate(out_dists, n_rows * k, stream); + raft::allocate(out_indices, n_rows * k, stream); } void SetUp() override { params = ::testing::TestWithParam< SparseSelectionInputs>::GetParam(); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); CUDA_CHECK(cudaStreamCreate(&stream)); n_rows = params.n_rows; @@ -99,15 +97,7 @@ class SparseSelectionTest } void TearDown() override { - CUDA_CHECK(cudaStreamSynchronize(stream)); - - CUDA_CHECK(cudaFree(dists)); - CUDA_CHECK(cudaFree(inds)); - CUDA_CHECK(cudaFree(out_indices)); - CUDA_CHECK(cudaFree(out_dists)); - CUDA_CHECK(cudaFree(out_indices_ref)); - CUDA_CHECK(cudaFree(out_dists_ref)); - + raft::deallocate_all(stream); CUDA_CHECK(cudaStreamDestroy(stream)); } diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index b9a8b849eb..e73a8a547b 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -20,7 +20,6 @@ #include "../test_utils.h" #include -#include #include @@ -55,10 +54,8 @@ TEST_P(COOSort, Result) { raft::random::Rng r(params.seed); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - raft::allocate(in_vals, params.nnz); + raft::allocate(in_vals, params.nnz, stream); r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream); int *in_rows_h = (int *)malloc(params.nnz * sizeof(int)); @@ -71,16 +68,16 @@ TEST_P(COOSort, Result) { in_cols_h[i] = i; } - raft::allocate(in_rows, params.nnz); - raft::allocate(in_cols, params.nnz); - raft::allocate(verify, params.nnz); + raft::allocate(in_rows, params.nnz, stream); + raft::allocate(in_cols, params.nnz, stream); + raft::allocate(verify, params.nnz, stream); raft::update_device(in_rows, in_rows_h, params.nnz, stream); raft::update_device(in_cols, in_cols_h, params.nnz, 
stream); raft::update_device(verify, verify_h, params.nnz, stream); - op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, + op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, stream); ASSERT_TRUE( diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index d104028d2b..35233dc473 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -17,11 +17,13 @@ #include #include #include -#include "../test_utils.h" - #include #include #include +#include +#include + +#include "../test_utils.h" #include @@ -63,9 +65,9 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< std::vector indices_h = params.indices_h; std::vector data_h = params.data_h; - allocate(indptr, indptr_h.size()); - allocate(indices, indices_h.size()); - allocate(data, data_h.size()); + raft::allocate(indptr, indptr_h.size(), stream); + raft::allocate(indices, indices_h.size(), stream); + raft::allocate(data, data_h.size(), stream); update_device(indptr, indptr_h.data(), indptr_h.size(), stream); update_device(indices, indices_h.data(), indices_h.size(), stream); @@ -78,7 +80,6 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< raft::handle_t handle; - auto alloc = handle.get_device_allocator(); stream = handle.get_stream(); make_data(); @@ -87,23 +88,22 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam< value_idx n = params.n_cols; value_idx nnz = params.indices_h.size(); - raft::mr::device::buffer coo_rows(alloc, stream, nnz); + rmm::device_uvector coo_rows(nnz, stream); raft::sparse::convert::csr_to_coo(indptr, m, coo_rows.data(), nnz, stream); - raft::sparse::COO out(alloc, stream); + raft::sparse::COO out(stream); raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out); - raft::mr::device::buffer sum(alloc, stream, 1); - - CUDA_CHECK(cudaMemsetAsync(sum.data(), 0, 1 * sizeof(value_idx), stream)); + rmm::device_scalar sum(stream); + sum.set_value_to_zero_async(stream); assert_symmetry<<>>( out.rows(), out.cols(), out.vals(), out.nnz, sum.data()); - raft::update_host(&sum_h, sum.data(), 1, stream); + sum_h = sum.value(stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } @@ -148,9 +148,6 @@ TEST_P(COOSymmetrize, Result) { cudaStream_t stream; cudaStreamCreate(&stream); - std::shared_ptr alloc( - new raft::mr::device::default_allocator); - int nnz = 8; int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3}; @@ -164,19 +161,19 @@ TEST_P(COOSymmetrize, Result) { float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0}; - COO in(alloc, stream, nnz, 4, 4); + COO in(stream, nnz, 4, 4); raft::update_device(in.rows(), *&in_rows_h, nnz, stream); raft::update_device(in.cols(), *&in_cols_h, nnz, stream); raft::update_device(in.vals(), *&in_vals_h, nnz, stream); - COO out(alloc, stream); + COO out(stream); linalg::coo_symmetrize<32, float>( &in, &out, [] __device__(int row, int col, float val, float trans) { return val + trans; }, - alloc, stream); + stream); CUDA_CHECK(cudaStreamSynchronize(stream)); std::cout << out << std::endl; diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu index def1f1685b..122d7f2d6a 100644 --- a/cpp/test/spatial/haversine.cu +++ b/cpp/test/spatial/haversine.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include "../test_utils.h" @@ -30,18 +29,18 @@ template class HaversineKNNTest : public ::testing::Test { protected: void basicTest() { - 
auto alloc = std::make_shared(); + CUDA_CHECK(cudaStreamCreate(&stream)); // Allocate input - raft::allocate(d_train_inputs, n * d); + raft::allocate(d_train_inputs, n * d, stream); // Allocate reference arrays - raft::allocate(d_ref_I, n * n); - raft::allocate(d_ref_D, n * n); + raft::allocate(d_ref_I, n * n, stream); + raft::allocate(d_ref_D, n * n, stream); // Allocate predicted arrays - raft::allocate(d_pred_I, n * n); - raft::allocate(d_pred_D, n * n); + raft::allocate(d_pred_I, n * n, stream); + raft::allocate(d_pred_D, n * n, stream); // make testdata on host std::vector h_train_inputs = { @@ -50,7 +49,7 @@ class HaversineKNNTest : public ::testing::Test { 0.53154002, -1.47049808, 0.72891737, -1.54095137}; h_train_inputs.resize(n); - raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, 0); + raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, stream); std::vector h_res_D = { 0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595, @@ -60,34 +59,28 @@ class HaversineKNNTest : public ::testing::Test { 0., 0.16461092, 0.20535265, 0.23048252, 0.2426416, 0.5170737, 0., 0.152463, 0.18767063, 0.20535265, 0.2345792, 0.44288665}; h_res_D.resize(n * n); - raft::update_device(d_ref_D, h_res_D.data(), n * n, 0); + raft::update_device(d_ref_D, h_res_D.data(), n * n, stream); std::vector h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1}; h_res_I.resize(n * n); - raft::update_device(d_ref_I, h_res_I.data(), n * n, 0); + raft::update_device(d_ref_I, h_res_I.data(), n * n, stream); std::vector input_vec = {d_train_inputs}; std::vector sizes_vec = {n}; - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - raft::spatial::knn::detail::haversine_knn( d_pred_I, d_pred_D, d_train_inputs, d_train_inputs, n, n, k, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void SetUp() override { basicTest(); } void TearDown() override { - CUDA_CHECK(cudaFree(d_train_inputs)); - CUDA_CHECK(cudaFree(d_pred_I)); - CUDA_CHECK(cudaFree(d_pred_D)); - CUDA_CHECK(cudaFree(d_ref_I)); - CUDA_CHECK(cudaFree(d_ref_D)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: @@ -103,6 +96,8 @@ class HaversineKNNTest : public ::testing::Test { value_idx *d_ref_I; value_t *d_ref_D; + + cudaStream_t stream; }; typedef HaversineKNNTest HaversineKNNTestF; diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu index de6251d32d..e4d05920c0 100644 --- a/cpp/test/spatial/knn.cu +++ b/cpp/test/spatial/knn.cu @@ -68,8 +68,8 @@ class KNNTest : public ::testing::TestWithParam { auto stream = handle_.get_stream(); - raft::allocate(actual_labels_, rows_ * k_, true); - raft::allocate(expected_labels_, rows_ * k_, true); + raft::allocate(actual_labels_, rows_ * k_, stream, true); + raft::allocate(expected_labels_, rows_ * k_, stream, true); std::vector input_vec; std::vector sizes_vec; @@ -104,6 +104,8 @@ class KNNTest : public ::testing::TestWithParam { cols_ = params_.input[0].size(); k_ = params_.k; + cudaStream_t stream = handle_.get_stream(); + std::vector row_major_input; for (std::size_t i = 0; i < params_.input.size(); ++i) { for (std::size_t j = 0; j < params_.input[i].size(); ++j) { @@ -111,31 +113,27 @@ class KNNTest : public ::testing::TestWithParam { } } rmm::device_buffer input_d = rmm::device_buffer( - row_major_input.data(), row_major_input.size() * sizeof(float), - handle_.get_stream()); + row_major_input.data(), 
row_major_input.size() * sizeof(float), stream); float *input_ptr = static_cast(input_d.data()); rmm::device_buffer labels_d = rmm::device_buffer( - params_.labels.data(), params_.labels.size() * sizeof(int), - handle_.get_stream()); + params_.labels.data(), params_.labels.size() * sizeof(int), stream); int *labels_ptr = static_cast(labels_d.data()); - raft::allocate(input_, rows_ * cols_, true); - raft::allocate(search_data_, rows_ * cols_, true); - raft::allocate(indices_, rows_ * k_, true); - raft::allocate(distances_, rows_ * k_, true); - raft::allocate(search_labels_, rows_, true); + raft::allocate(input_, rows_ * cols_, stream, true); + raft::allocate(search_data_, rows_ * cols_, stream, true); + raft::allocate(indices_, rows_ * k_, stream, true); + raft::allocate(distances_, rows_ * k_, stream, true); + raft::allocate(search_labels_, rows_, stream, true); - raft::copy(input_, input_ptr, rows_ * cols_, handle_.get_stream()); - raft::copy(search_data_, input_ptr, rows_ * cols_, handle_.get_stream()); - raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream()); + raft::copy(input_, input_ptr, rows_ * cols_, stream); + raft::copy(search_data_, input_ptr, rows_ * cols_, stream); + raft::copy(search_labels_, labels_ptr, rows_, stream); } void TearDown() override { - CUDA_CHECK(cudaFree(search_data_)); - CUDA_CHECK(cudaFree(indices_)); - CUDA_CHECK(cudaFree(distances_)); - CUDA_CHECK(cudaFree(actual_labels_)); + cudaStream_t stream = handle_.get_stream(); + raft::deallocate_all(stream); } private: diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index e5c2d52764..b85d35e3f8 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -57,27 +57,24 @@ TEST(Raft, SpectralMatrices) { ASSERT_EQ(nullptr, sm2.row_offsets_); auto stream = h.get_stream(); - auto t_exe_pol = thrust::cuda::par.on(stream); - auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - laplacian_matrix_t lm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) { + laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args - auto cnstr_lm2 = [&h, t_exe_pol, &sm2](void) { - laplacian_matrix_t lm2{h, t_exe_pol, sm2}; + auto cnstr_lm2 = [&h, &sm2](void) { + laplacian_matrix_t lm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args - auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) { - modularity_matrix_t mm1{h, t_exe_pol, ro, ci, - vs, nrows, nnz}; + auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) { + modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz}; }; EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args - auto cnstr_mm2 = [&h, t_exe_pol, &sm2](void) { - modularity_matrix_t mm2{h, t_exe_pol, sm2}; + auto cnstr_mm2 = [&h, &sm2](void) { + modularity_matrix_t mm2{h, sm2}; }; EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args } diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 4a3b0ed196..a3c88a92be 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -49,11 +49,10 @@ class MeanTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, len); - allocate(mean_act, cols); + raft::allocate(data, len, stream); + raft::allocate(mean_act, cols, stream); r.normal(data, len, params.mean, (T)1.0, stream); meanSGtest(data, stream); @@ -66,13 +65,14 @@ class MeanTest 
: public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MeanInputs params; T *data, *mean_act; + cudaStream_t stream; }; // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index 8b0d607561..b827230b5d 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -47,17 +47,16 @@ class MeanCenterTest params = ::testing::TestWithParam>::GetParam(); raft::random::Rng r(params.seed); - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); auto rows = params.rows, cols = params.cols; auto len = rows * cols; IdxType vecLen = params.bcastAlongRows ? cols : rows; - raft::allocate(out, len); - raft::allocate(out_ref, len); - raft::allocate(data, len); - raft::allocate(meanVec, vecLen); + raft::allocate(out, len, stream); + raft::allocate(out_ref, len, stream); + raft::allocate(data, len, stream); + raft::allocate(meanVec, vecLen, stream); r.normal(data, len, params.mean, (T)1.0, stream); raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor, stream); @@ -65,19 +64,18 @@ class MeanCenterTest params.bcastAlongRows, stream); raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, (T)-1.0); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(meanVec)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: MeanCenterInputs params; T *data, *meanVec, *out, *out_ref; + cudaStream_t stream; }; const std::vector> inputsf_i32 = { diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index ff2698788f..fd374249d2 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -47,15 +47,14 @@ class StdDevTest : public ::testing::TestWithParam> { int rows = params.rows, cols = params.cols; int len = rows * cols; - cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, len); - allocate(mean_act, cols); - allocate(stddev_act, cols); - allocate(vars_act, cols); + raft::allocate(data, len, stream); + raft::allocate(mean_act, cols, stream); + raft::allocate(stddev_act, cols, stream); + raft::allocate(vars_act, cols, stream); r.normal(data, len, params.mean, params.stddev, stream); stdVarSGtest(data, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void stdVarSGtest(T *data, cudaStream_t stream) { @@ -73,15 +72,14 @@ class StdDevTest : public ::testing::TestWithParam> { } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(mean_act)); - CUDA_CHECK(cudaFree(stddev_act)); - CUDA_CHECK(cudaFree(vars_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: StdDevInputs params; T *data, *mean_act, *stddev_act, *vars_act; + cudaStream_t stream; }; const std::vector> inputsf = { diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index c3140d4588..58ebec7859 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -43,9 +43,8 @@ class SumTest : public ::testing::TestWithParam> { params = ::testing::TestWithParam>::GetParam(); int rows = params.rows, cols = params.cols; int len = rows * cols; - 
cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - raft::allocate(data, len); + raft::allocate(data, len, stream); T data_h[len]; for (int i = 0; i < len; i++) { @@ -54,19 +53,20 @@ class SumTest : public ::testing::TestWithParam> { raft::update_device(data, data_h, len, stream); - raft::allocate(sum_act, cols); + raft::allocate(sum_act, cols, stream); sum(sum_act, data, cols, rows, false, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); } void TearDown() override { - CUDA_CHECK(cudaFree(data)); - CUDA_CHECK(cudaFree(sum_act)); + raft::deallocate_all(stream); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: SumInputs params; T *data, *sum_act; + cudaStream_t stream; }; const std::vector> inputsf = {{0.05f, 1024, 32, 1234ULL}, diff --git a/python/raft/common/handle.pxd b/python/raft/common/handle.pxd index 6076640312..884d81bed1 100644 --- a/python/raft/common/handle.pxd +++ b/python/raft/common/handle.pxd @@ -34,7 +34,5 @@ cdef extern from "raft/handle.hpp" namespace "raft" nogil: handle_t() except + handle_t(int ns) except + void set_stream(_Stream s) except + - void set_device_allocator(shared_ptr[allocator] a) except + - shared_ptr[allocator] get_device_allocator() except + _Stream get_stream() except + int get_num_internal_streams() except + From 820e14d621f4c25a1e0dd6b879e01431dee2a300 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 30 Aug 2021 17:08:41 -0500 Subject: [PATCH 012/171] Pin rmm to branch-21.10 and remove warnings from kmeans.hpp (#322) Authors: - Dante Gama Dessavre (https://github.com/dantegd) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Corey J. Nolet (https://github.com/cjnolet) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/raft/pull/322 --- cpp/cmake/thirdparty/get_rmm.cmake | 6 +++--- cpp/include/raft/spectral/kmeans.hpp | 16 ++++++---------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index e990ab1367..51f959a8d9 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -32,8 +32,8 @@ function(find_and_configure_rmm VERSION) INSTALL_EXPORT_SET raft-exports CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/rmm.git - GIT_TAG 23bbe745af1d988224b5498f7b8e3fe3720532d4 - GIT_SHALLOW FALSE + GIT_TAG branch-${MAJOR_AND_MINOR} + GIT_SHALLOW TRUE OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "CUDA_STATIC_RUNTIME ${CUDA_STATIC_RUNTIME}" @@ -44,4 +44,4 @@ endfunction() set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") -find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) \ No newline at end of file +find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp index b6f0105487..d089b85518 100644 --- a/cpp/include/raft/spectral/kmeans.hpp +++ b/cpp/include/raft/spectral/kmeans.hpp @@ -256,7 +256,7 @@ static __global__ void minDistances2(index_type_t n, */ template static __global__ void computeClusterSizes( - index_type_t n, index_type_t k, const index_type_t* __restrict__ codes, + index_type_t n, const index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes) { index_type_t i = threadIdx.x + blockIdx.x * blockDim.x; while (i < n) { @@ -341,7 +341,7 @@ static __global__ void divideCentroids( */ template static int chooseNewCentroid(handle_t const& handle, index_type_t n, - index_type_t d, index_type_t k, 
value_type_t rand, + index_type_t d, value_type_t rand, const value_type_t* __restrict__ obs, value_type_t* __restrict__ dists, value_type_t* __restrict__ centroid) { @@ -353,7 +353,6 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n, index_type_t obsIndex; auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); // Compute cumulative sum of distances @@ -450,7 +449,6 @@ static int initializeCentroids( thrust::uniform_real_distribution uniformDist(0, 1); auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); constexpr index_type_t grid_lower_bound{65535}; @@ -478,8 +476,7 @@ static int initializeCentroids( thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n), 1); CHECK_CUDA(stream); - if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, - centroids)) + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids)) WARNING("error in k-means++ (could not pick centroid)"); // Compute distances from first centroid @@ -491,7 +488,7 @@ static int initializeCentroids( // Choose remaining centroids for (i = 1; i < k; ++i) { // Choose ith centroid - if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, dists, + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d))) WARNING("error in k-means++ (could not pick centroid)"); @@ -509,7 +506,7 @@ static int initializeCentroids( // Compute cluster sizes CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream)); - computeClusterSizes<<>>(n, k, codes, + computeClusterSizes<<>>(n, codes, clusterSizes); CHECK_CUDA(stream); @@ -552,7 +549,6 @@ static int assignCentroids(handle_t const& handle, index_type_t n, index_type_t* __restrict__ clusterSizes, value_type_t* residual_host) { auto stream = handle.get_stream(); - auto cublas_h = handle.get_cublas_handle(); auto thrust_exec_policy = handle.get_thrust_policy(); // Compute distance between centroids and observation vectors @@ -852,7 +848,7 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d, // conditions, such as if obs is corrupt (as seen as a result of a // DataFrame column of NULL edge vals used to create the Graph) while (emptyCentroid < k) { - if (chooseNewCentroid(handle, n, d, k, uniformDist(rng), obs, work, + if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d))) WARNING("could not replace empty centroid"); if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, From e806f999d950abe714921bb4ad78e8a2d1bbfae0 Mon Sep 17 00:00:00 2001 From: Micka <9810050+lowener@users.noreply.github.com> Date: Tue, 7 Sep 2021 17:44:41 +0200 Subject: [PATCH 013/171] Fix build doc (#316) Authors: - Micka (https://github.com/lowener) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/316 --- BUILD.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/BUILD.md b/BUILD.md index a8d22f18d9..844a563a90 100644 --- a/BUILD.md +++ b/BUILD.md @@ -29,14 +29,14 @@ else(DEFINED ENV{RAFT_PATH}) set(RAFT_GIT_DIR ${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo") ExternalProject_Add(raft - GIT_REPOSITORY git@github.com:dantegd/barge.git + GIT_REPOSITORY git@github.com:rapidsai/raft.git GIT_TAG pinned_commit/git_tag/branch PREFIX ${RAFT_GIT_DIR} 
CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "") - set(RAFT_INCLUDE_DIR ${RAFT_DIR}/src/raft/cpp/include CACHE STRING "RAFT include variable") + set(RAFT_INCLUDE_DIR ${RAFT_GIT_DIR}/src/raft/cpp/include CACHE STRING "RAFT include variable") endif(DEFINED ENV{RAFT_PATH}) ``` From f311247690437f669936217b38d62c606244621e Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 9 Sep 2021 14:22:39 -0400 Subject: [PATCH 014/171] Update with rapids cmake new features (#320) This combines some general CMake style cleanup and brings new rapids-cmake features to RAFT including: - Usage of `rapids_cmake_install_lib_dir` to make sure we install raft correctly on non-debain based distro's ( lib64 ), while also handling conda installation requirements ( always lib no matter the distro ) - Usage of `rapids_cpm` pre-configured pacakges - Removal of early termination before `rapids_cpm_find` since a better solution now exists ( https://github.com/rapidsai/rapids-cmake/issues/49 ) Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/320 --- cpp/CMakeLists.txt | 10 +++++----- cpp/cmake/thirdparty/get_cuco.cmake | 4 ---- cpp/cmake/thirdparty/get_gtest.cmake | 27 ++++----------------------- cpp/cmake/thirdparty/get_nccl.cmake | 2 +- cpp/cmake/thirdparty/get_rmm.cmake | 27 ++++----------------------- cpp/cmake/thirdparty/get_thrust.cmake | 15 ++++++--------- cpp/test/CMakeLists.txt | 2 +- 7 files changed, 21 insertions(+), 66 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 04eaf548ce..18dbb25956 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -90,12 +90,12 @@ include(cmake/modules/ConfigureCUDA.cmake) ############################################################################## # - Requirements ------------------------------------------------------------- -if (NOT DISABLE_OPENMP OR NOT ${DISABLE_OPENMP}) +if (NOT DISABLE_OPENMP) find_package(OpenMP) if(OPENMP_FOUND) message(VERBOSE "RAFT: OpenMP found in ${OpenMP_CXX_INCLUDE_DIRS}") - endif(OPENMP_FOUND) -endif(NOT DISABLE_OPENMP OR NOT ${DISABLE_OPENMP}) + endif() +endif() # add third party dependencies using CPM rapids_cpm_init() @@ -113,7 +113,7 @@ endif() ############################################################################## # - install targets----------------------------------------------------------- - +rapids_cmake_install_lib_dir( lib_dir ) add_library(raft INTERFACE) add_library(raft::raft ALIAS raft) target_include_directories(raft INTERFACE "$" @@ -133,7 +133,7 @@ INTERFACE target_compile_features(raft INTERFACE cxx_std_17 $) install(TARGETS raft - DESTINATION lib + DESTINATION ${lib_dir} EXPORT raft-exports ) diff --git a/cpp/cmake/thirdparty/get_cuco.cmake b/cpp/cmake/thirdparty/get_cuco.cmake index 1bfac473d5..06b2d17e7b 100644 --- a/cpp/cmake/thirdparty/get_cuco.cmake +++ b/cpp/cmake/thirdparty/get_cuco.cmake @@ -16,10 +16,6 @@ function(find_and_configure_cuco VERSION) - if(TARGET cuco::cuco) - return() - endif() - rapids_cpm_find(cuco ${VERSION} GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET raft-exports diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 4cd11dab98..7c234283d5 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -14,30 +14,11 @@ # limitations under the License. 
#============================================================================= -function(find_and_configure_gtest VERSION) +function(find_and_configure_gtest ) - if(TARGET GTest::gtest) - return() - endif() - - rapids_cpm_find(GTest ${VERSION} - GLOBAL_TARGETS gest gtest_main GTest::gtest GTest::gtest_main - CPM_ARGS - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-${VERSION} - GIT_SHALLOW TRUE - OPTIONS "INSTALL_GTEST OFF" - # googletest >= 1.10.0 provides a cmake config file -- use it if it exists - FIND_PACKAGE_ARGUMENTS "CONFIG" - ) - - if(NOT TARGET GTest::gtest) - add_library(GTest::gtest ALIAS gtest) - add_library(GTest::gtest_main ALIAS gtest_main) - endif() + include(${rapids-cmake-dir}/cpm/gtest.cmake) + rapids_cpm_gtest() endfunction() -set(RAFT_MIN_VERSION_gtest 1.10.0) - -find_and_configure_gtest(${RAFT_MIN_VERSION_gtest}) +find_and_configure_gtest() diff --git a/cpp/cmake/thirdparty/get_nccl.cmake b/cpp/cmake/thirdparty/get_nccl.cmake index a80eefab80..118ae37704 100644 --- a/cpp/cmake/thirdparty/get_nccl.cmake +++ b/cpp/cmake/thirdparty/get_nccl.cmake @@ -16,7 +16,7 @@ function(find_and_configure_nccl) - if(TARGET nccl::nccl) + if(TARGET NCCL::NCCL) return() endif() diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index 51f959a8d9..7c155d446f 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -14,34 +14,15 @@ # limitations under the License. #============================================================================= -function(find_and_configure_rmm VERSION) +function(find_and_configure_rmm) - if(TARGET rmm::rmm) - return() - endif() - - if(${VERSION} MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) - set(MAJOR_AND_MINOR "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}") - else() - set(MAJOR_AND_MINOR "${VERSION}") - endif() - - rapids_cpm_find(rmm ${VERSION} + include(${rapids-cmake-dir}/cpm/rmm.cmake) + rapids_cpm_rmm( GLOBAL_TARGETS rmm::rmm BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/rapidsai/rmm.git - GIT_TAG branch-${MAJOR_AND_MINOR} - GIT_SHALLOW TRUE - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "CUDA_STATIC_RUNTIME ${CUDA_STATIC_RUNTIME}" - "DISABLE_DEPRECATION_WARNING ${DISABLE_DEPRECATION_WARNING}" ) endfunction() -set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00") - -find_and_configure_rmm(${RAFT_MIN_VERSION_rmm}) +find_and_configure_rmm() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 6dd9a91870..c28ff6e66d 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -13,18 +13,15 @@ # ============================================================================= # Use CPM to find or clone thrust -function(find_and_configure_thrust VERSION) +function(find_and_configure_thrust) + include(${rapids-cmake-dir}/cpm/thrust.cmake) - rapids_cpm_find( - Thrust ${VERSION} + rapids_cpm_thrust( + NAMESPACE raft BUILD_EXPORT_SET raft-exports INSTALL_EXPORT_SET raft-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/thrust.git - GIT_TAG ${VERSION} - GIT_SHALLOW TRUE - OPTIONS "THRUST_INSTALL OFF") + ) endfunction() -find_and_configure_thrust(1.12.0) +find_and_configure_thrust() diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0428e09142..fb766a5bcd 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -131,7 +131,7 @@ PRIVATE FAISS::FAISS GTest::gtest 
GTest::gtest_main - OpenMP::OpenMP_CXX Threads::Threads + $ $ ) From f3df0ba74906bbc46e2245b19218674311eda02e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com> Date: Wed, 15 Sep 2021 17:51:32 -0400 Subject: [PATCH 015/171] Add broadcast with const input iterator (#328) The current ```bcast``` function takes a single ```value_t*``` pointer (MPI style) for both input (if root) and output (if non-root). This does not compile if we have ```const value_t*``` pointer for input. This PR adds a ```bcast``` function that takes separate ```const value_t*``` input and ```value_t```` output pointers (NCCL style). Authors: - Seunghwa Kang (https://github.com/seunghwak) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/328 --- cpp/include/raft/comms/comms.hpp | 21 +++++++++++++++++++++ cpp/include/raft/comms/mpi_comms.hpp | 7 +++++++ cpp/include/raft/comms/std_comms.hpp | 7 +++++++ 3 files changed, 35 insertions(+) diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index dc172c9503..3f2f6b28f9 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -114,6 +114,10 @@ class comms_iface { virtual void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0; + virtual void bcast(const void* sendbuff, void* recvbuff, size_t count, + datatype_t datatype, int root, + cudaStream_t stream) const = 0; + virtual void reduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const = 0; @@ -282,6 +286,23 @@ class comms_t { stream); } + /** + * Broadcast data from one rank to the rest + * @tparam value_t datatype of underlying buffers + * @param sendbuff buffer containing data to broadcast (only used in root) + * @param recvbuff buffer to receive broadcasted data + * @param count number of elements if buff + * @param root the rank initiating the broadcast + * @param stream CUDA stream to synchronize operation + */ + template + void bcast(const value_t* sendbuff, value_t* recvbuff, size_t count, int root, + cudaStream_t stream) const { + impl_->bcast(static_cast(sendbuff), + static_cast(recvbuff), count, get_type(), root, + stream); + } + /** * Reduce data from many ranks down to a single rank * @tparam value_t datatype of underlying buffers diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index 8dda74f0a9..067c7bd0ab 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -202,6 +202,13 @@ class mpi_comms : public comms_iface { nccl_comm_, stream)); } + void bcast(const void* sendbuff, void* recvbuff, size_t count, + datatype_t datatype, int root, cudaStream_t stream) const { + NCCL_TRY(ncclBroadcast(sendbuff, recvbuff, count, + get_nccl_datatype(datatype), root, nccl_comm_, + stream)); + } + void reduce(const void* sendbuff, void* recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const { diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index ff75931fb9..562c548bcb 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -307,6 +307,13 @@ class std_comms : public comms_iface { nccl_comm_, stream)); } + void bcast(const void *sendbuff, void *recvbuff, size_t count, + datatype_t datatype, int root, cudaStream_t stream) const { + NCCL_TRY(ncclBroadcast(sendbuff, recvbuff, count, + 
get_nccl_datatype(datatype), root, nccl_comm_, + stream)); + } + void reduce(const void *sendbuff, void *recvbuff, size_t count, datatype_t datatype, op_t op, int root, cudaStream_t stream) const { From 028609577a3e8d200e63ee306ea661da72732f53 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Thu, 16 Sep 2021 19:39:52 +0200 Subject: [PATCH 016/171] Fix wrong lda parameter in gemv (#327) Fix wrong `lda` parameter in raft::linalg::gemv. lda should always be along `n_rows` direction, independently of `trans_a`. I also took a liberty to add couple more overloads and documentation. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Corey J. Nolet (https://github.com/cjnolet) - Brad Rees (https://github.com/BradReesWork) - Chuck Hastings (https://github.com/ChuckHastings) URL: https://github.com/rapidsai/raft/pull/327 --- cpp/include/raft/linalg/gemv.h | 129 ++++++++++++++++++++++------- cpp/test/CMakeLists.txt | 1 + cpp/test/linalg/gemv.cu | 143 +++++++++++++++++++++++++++++++++ 3 files changed, 242 insertions(+), 31 deletions(-) create mode 100644 cpp/test/linalg/gemv.cu diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h index edd18b3bee..0be11a0301 100644 --- a/cpp/include/raft/linalg/gemv.h +++ b/cpp/include/raft/linalg/gemv.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,49 +26,116 @@ namespace raft { namespace linalg { template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols, - const math_t* x, int incx, math_t* y, int incy, bool trans_a, - math_t alpha, math_t beta, cudaStream_t stream) { +void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows, + const int n_cols, const math_t *x, const int incx, math_t *y, + const int incy, const bool trans_a, const math_t alpha, + const math_t beta, cudaStream_t stream) { cublasHandle_t cublas_h = handle.get_cublas_handle(); - cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, + incx, &beta, y, incy, stream)); +} - // Unfortunately there is a clash of terminology - // in BLAS https://docs.nvidia.com/cuda/cublas/index.html is opposite to Machine Learning - // In blas: - // m - number of rows in input matrix - // n - number of columns in input matrix - // lda - purpose of it to have ability to operate on submatrices of matrix without copying. - // If you're not think about it it's always should be equal to m - // lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform transpose - - // In Machine Learning: - // m - nunmber of columns in design matrix(number of features) - // n - number of rows in designed matrix (number of train examples) +/** + * y = alpha * op(A) * x + beta * y + * + * where + * + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * + * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. + * set it when you need to use only the first n_rows_a rows of the matrix A, which has + * (perhaps, due to padding) lda rows. + * + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. 
+ * + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + */ +template +void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, + const int n_cols_a, const math_t *x, math_t *y, const bool trans_a, + const math_t alpha, const math_t beta, cudaStream_t stream) { + gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); +} - int m = n_rows; - int n = n_cols; - int lda = trans_a ? m : n; +/** + * y = op(A) * x + * + * where + * + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. + * + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + */ +template +void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, + const int n_cols_a, const math_t *x, math_t *y, const bool trans_a, + cudaStream_t stream) { + math_t alpha = math_t(1); + math_t beta = math_t(0); - CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta, - y, incy, stream)); + gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); } +/** + * y = alpha * op(A) * x + beta * y + * + * where + * + * @param alpha is a scalar scale of Ax. + * + * @param beta is a scalar scale of y. + * + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * + * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. + * set it when you need to use only the first n_rows_a rows of the matrix A, which has + * (perhaps, due to padding) lda rows. + * + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. + * + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + */ template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a, - int n_cols_a, const math_t* x, math_t* y, bool trans_a, math_t alpha, - math_t beta, cudaStream_t stream) { - gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); +void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, + const int n_cols_a, const int lda, const math_t *x, math_t *y, + const bool trans_a, const math_t alpha, const math_t beta, + cudaStream_t stream) { + cublasHandle_t cublas_h = handle.get_cublas_handle(); + cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, + 1, &beta, y, 1, stream)); } +/** + * y = op(A) * x + * + * where + * + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * + * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. + * set it when you need to use only the first n_rows_a rows of the matrix A, which has + * (perhaps, due to padding) lda rows. + * + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. + * + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. 
+ * + */ template -void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a, - int n_cols_a, const math_t* x, math_t* y, bool trans_a, - cudaStream_t stream) { +void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a, + const int n_cols_a, const int lda, const math_t *x, math_t *y, + const bool trans_a, cudaStream_t stream) { math_t alpha = math_t(1); math_t beta = math_t(0); - - gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream); + gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); } }; // namespace linalg diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index fb766a5bcd..42066061f3 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -47,6 +47,7 @@ add_executable(test_raft test/linalg/eig.cu test/linalg/eig_sel.cu test/linalg/gemm_layout.cu + test/linalg/gemv.cu test/linalg/map.cu test/linalg/map_then_reduce.cu test/linalg/matrix_vector_op.cu diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu new file mode 100644 index 0000000000..4a474bc461 --- /dev/null +++ b/cpp/test/linalg/gemv.cu @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../test_utils.h" + +namespace raft { +namespace linalg { + +template +struct GemvInputs { + int n_rows; + int n_cols; + int lda; + bool trans_a; + unsigned long long int seed; +}; + +// Reference GEMV implementation. +template +__global__ void naiveGemv(T *y, const T *A, const T *x, const int n_rows, + const int n_cols, const int lda, const bool trans_a) { + int istart = blockIdx.x * blockDim.x + threadIdx.x; + int istep = blockDim.x * gridDim.x; + + if (!trans_a) { + for (int i = istart; i < n_rows; i += istep) { + T t = T(0.0); + for (int j = 0; j < n_cols; j++) { + t += A[i + lda * j] * x[j]; + } + y[i] = t; + } + } else { + for (int i = istart; i < n_cols; i += istep) { + T t = T(0.0); + for (int j = 0; j < n_rows; j++) { + t += A[lda * i + j] * x[j]; + } + y[i] = t; + } + } +} + +template +class GemvTest : public ::testing::TestWithParam> { + protected: + GemvInputs params; + rmm::device_uvector refy; // Reference result for comparison + rmm::device_uvector y; // Computed result + + public: + GemvTest() + : testing::TestWithParam>(), + refy(0, rmm::cuda_stream_default), + y(0, rmm::cuda_stream_default) { + rmm::cuda_stream_default.synchronize(); + } + + protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + + raft::handle_t handle; + cudaStream_t stream = handle.get_stream(); + + raft::random::Rng r(params.seed); + + // We compute y = op(A) * x and compare against reference result + size_t aElems = params.lda * params.n_cols; + size_t xElems = params.trans_a ? params.n_rows : params.n_cols; + size_t yElems = params.trans_a ? 
params.n_cols : params.n_rows; + + rmm::device_uvector A(aElems, stream); + rmm::device_uvector x(xElems, stream); + refy.resize(yElems, stream); + y.resize(yElems, stream); + + r.uniform(x.data(), xElems, T(-10.0), T(10.0), stream); + r.uniform(A.data(), aElems, T(-10.0), T(10.0), stream); + + dim3 blocks(raft::ceildiv(yElems, 256), 1, 1); + dim3 threads(256, 1, 1); + + naiveGemv<<>>(refy.data(), A.data(), x.data(), + params.n_rows, params.n_cols, params.lda, + params.trans_a); + + gemv(handle, A.data(), params.n_rows, params.n_cols, params.lda, x.data(), + y.data(), params.trans_a, stream); + } + + void TearDown() override {} +}; + +const std::vector> inputsf = { + {80, 70, 80, true, 76433ULL}, {80, 100, 80, true, 426646ULL}, + {20, 100, 20, true, 37703ULL}, {100, 60, 200, true, 538004ULL}, + {50, 10, 60, false, 73012ULL}, {90, 90, 90, false, 538147ULL}, + {30, 100, 30, false, 412352ULL}, {40, 80, 100, false, 297941ULL}}; + +const std::vector> inputsd = { + {10, 70, 10, true, 535648ULL}, {30, 30, 30, true, 956681ULL}, + {70, 80, 70, true, 875083ULL}, {80, 90, 200, true, 50744ULL}, + {90, 90, 90, false, 506321ULL}, {40, 100, 70, false, 638418ULL}, + {80, 50, 80, false, 701529ULL}, {50, 80, 60, false, 893038ULL}}; + +typedef GemvTest GemvTestF; +TEST_P(GemvTestF, Result) { + ASSERT_TRUE(raft::devArrMatch(refy.data(), y.data(), + params.trans_a ? params.n_cols : params.n_rows, + raft::CompareApprox(1e-4))); +} + +typedef GemvTest GemvTestD; +TEST_P(GemvTestD, Result) { + ASSERT_TRUE(raft::devArrMatch(refy.data(), y.data(), + params.trans_a ? params.n_cols : params.n_rows, + raft::CompareApprox(1e-6))); +} + +INSTANTIATE_TEST_SUITE_P(GemvTests, GemvTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_SUITE_P(GemvTests, GemvTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace linalg +} // end namespace raft From 9ce2fd0d863be9f6dfa3cede7133acc2add705d6 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 16 Sep 2021 16:48:16 -0400 Subject: [PATCH 017/171] DOC v21.12 Updates --- CHANGELOG.md | 4 ++++ cpp/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b8d89d1a8f..cf36c3facd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# raft 21.12.00 (Date TBD) + +Please see https://github.com/rapidsai/raft/releases/tag/v21.12.00a for the latest changes to this development branch. + # raft 21.10.00 (Date TBD) Please see https://github.com/rapidsai/raft/releases/tag/v21.10.00a for the latest changes to this development branch. 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 18dbb25956..4f24b5e8db 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -26,7 +26,7 @@ include(rapids-find) rapids_cuda_init_architectures(RAFT) -project(RAFT VERSION 21.10.00 LANGUAGES CXX CUDA) +project(RAFT VERSION 21.12.00 LANGUAGES CXX CUDA) ############################################################################## # - build type --------------------------------------------------------------- From 65a675f7d66851777f64b79a12af740561b1dc4f Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Fri, 17 Sep 2021 12:33:58 -0700 Subject: [PATCH 018/171] Accounting for rmm::cuda_stream_pool not having a constructor for 0 streams (#329) Comes from this PR https://github.com/rapidsai/rmm/pull/873 Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/329 --- cpp/include/raft/handle.hpp | 28 +++++++++++++++++++++------- cpp/test/cluster_solvers.cu | 1 - cpp/test/eigen_solvers.cu | 2 -- cpp/test/handle.cpp | 14 -------------- cpp/test/spectral_matrix.cu | 1 - 5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp index c925669530..190062e92f 100644 --- a/cpp/include/raft/handle.hpp +++ b/cpp/include/raft/handle.hpp @@ -61,8 +61,10 @@ class handle_t { int cur_dev = -1; CUDA_CHECK(cudaGetDevice(&cur_dev)); return cur_dev; - }()), - streams_(n_streams) { + }()) { + if (n_streams != 0) { + streams_ = std::make_unique(n_streams); + } create_resources(); thrust_policy_ = std::make_unique(user_stream_); } @@ -78,10 +80,13 @@ class handle_t { */ handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams) - : dev_id_(other.get_device()), streams_(n_streams) { + : dev_id_(other.get_device()) { RAFT_EXPECTS( other.get_num_internal_streams() > 0, "ERROR: the main handle must have at least one worker stream\n"); + if (n_streams != 0) { + streams_ = std::make_unique(n_streams); + } prop_ = other.get_device_properties(); device_prop_initialized_ = true; create_resources(); @@ -140,14 +145,23 @@ class handle_t { // legacy compatibility for cuML cudaStream_t get_internal_stream(int sid) const { - return streams_.get_stream(sid).value(); + RAFT_EXPECTS( + streams_.get() != nullptr, + "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); + return streams_->get_stream(sid).value(); } // new accessor return rmm::cuda_stream_view rmm::cuda_stream_view get_internal_stream_view(int sid) const { - return streams_.get_stream(sid); + RAFT_EXPECTS( + streams_.get() != nullptr, + "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value"); + return streams_->get_stream(sid); + } + + int get_num_internal_streams() const { + return streams_.get() != nullptr ? 
streams_->get_pool_size() : 0; } - int get_num_internal_streams() const { return streams_.get_pool_size(); } std::vector get_internal_streams() const { std::vector int_streams_vec; for (int i = 0; i < get_num_internal_streams(); i++) { @@ -212,7 +226,7 @@ class handle_t { std::unordered_map> subcomms_; const int dev_id_; - rmm::cuda_stream_pool streams_{0}; + std::unique_ptr streams_{nullptr}; mutable cublasHandle_t cublas_handle_; mutable bool cublas_initialized_{false}; mutable cusolverDnHandle_t cusolver_dn_handle_; diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu index d280b3e95c..06b246d9a1 100644 --- a/cpp/test/cluster_solvers.cu +++ b/cpp/test/cluster_solvers.cu @@ -58,7 +58,6 @@ TEST(Raft, ModularitySolvers) { using value_type = double; handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); index_type neigvs{10}; diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu index 15794ef568..ede790b38c 100644 --- a/cpp/test/eigen_solvers.cu +++ b/cpp/test/eigen_solvers.cu @@ -31,7 +31,6 @@ TEST(Raft, EigenSolvers) { using value_type = double; handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); index_type* ro{nullptr}; @@ -73,7 +72,6 @@ TEST(Raft, SpectralSolvers) { using value_type = double; handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); index_type neigvs{10}; diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp index 4cb9809844..3e27789078 100644 --- a/cpp/test/handle.cpp +++ b/cpp/test/handle.cpp @@ -24,7 +24,6 @@ namespace raft { TEST(Raft, HandleDefault) { handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); ASSERT_EQ(nullptr, h.get_stream()); ASSERT_NE(nullptr, h.get_cublas_handle()); @@ -55,7 +54,6 @@ TEST(Raft, GetHandleFromPool) { handle_t child(parent, 2); ASSERT_EQ(parent.get_internal_stream(2), child.get_stream()); - ASSERT_EQ(0, child.get_num_internal_streams()); child.set_stream(parent.get_internal_stream(3)); ASSERT_EQ(parent.get_internal_stream(3), child.get_stream()); @@ -64,18 +62,6 @@ TEST(Raft, GetHandleFromPool) { ASSERT_EQ(parent.get_device(), child.get_device()); } -TEST(Raft, GetHandleFromPoolPerf) { - handle_t parent(100); - auto start = curTimeMillis(); - for (int i = 0; i < parent.get_num_internal_streams(); i++) { - handle_t child(parent, i); - ASSERT_EQ(parent.get_internal_stream(i), child.get_stream()); - child.wait_on_user_stream(); - } - // upperbound on 0.1ms per child handle - ASSERT_LE(curTimeMillis() - start, 10); -} - TEST(Raft, GetHandleStreamViews) { handle_t parent(4); diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu index b85d35e3f8..388ad56f2d 100644 --- a/cpp/test/spectral_matrix.cu +++ b/cpp/test/spectral_matrix.cu @@ -38,7 +38,6 @@ TEST(Raft, SpectralMatrices) { using value_type = double; handle_t h; - ASSERT_EQ(0, h.get_num_internal_streams()); ASSERT_EQ(0, h.get_device()); csr_view_t csr_v{nullptr, nullptr, nullptr, 0, 0}; From af3deeafd25b4263c14b93a4d3cb49437d899d35 Mon Sep 17 00:00:00 2001 From: Mahesh Doijade <36705640+mdoijade@users.noreply.github.com> Date: Mon, 20 Sep 2021 22:39:51 +0530 Subject: [PATCH 019/171] Fused L2 (unexpanded) kNN kernel for NN <= 64, without using temporary gmem to store intermediate distances (#324) benchmarking with cuml python interface kNN datasets (it claims to generate gaussian distribution) tried till 200k x 128 database/query vectors. 
found some different behavior on my small GPU GP107 vs on GA102(Tesla A40) on GP107 fused L2 kNN is slower on larger datasets on GA102 fused L2 kNN is always faster like approx **1.15x-1.5x** for all datasets I tried (except 200k x 128). I will also have L2 expanded version of fused L2 kNN in a separate PR due to which on larger dimension like > 128 distance computation from fused L2 kNN won't become bottleneck. There is scope to optimize the distance computation in fused L2 kNN as there is no usage of vectorized LDG/STS in it. Overall it looks that fused L2 kNN is better on GPUs with decent compute power but not on small old GPUs like GP107. & benchmarking with cuml cpp kNN regression tests the performance is On A30 (GA100) , For NN == 64, resultant Dist matrix 1M x 1M, Fused L2 kNN = 11550ms FAISS kNN = 23933 ms. **Overall 2.07x faster** And for NN == 32, it is **1.43x faster** runtimes for NN == 32, Fused L2 kNN = 11198ms FAISS kNN = 16124ms Authors: - Mahesh Doijade (https://github.com/mdoijade) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/324 --- cpp/include/raft/linalg/contractions.cuh | 21 + .../raft/spatial/knn/detail/fused_l2_knn.cuh | 612 ++++++++++++++++++ .../knn/detail/knn_brute_force_faiss.cuh | 114 ++-- 3 files changed, 709 insertions(+), 38 deletions(-) create mode 100644 cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index aa711a9140..e6ff8a49ce 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -166,6 +166,27 @@ struct Policy4x4 { }; /** @} */ +/** + * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 + * @{ + */ +template +struct Policy2x8 {}; + +template +struct Policy2x8 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + +template +struct Policy2x8 { + // this is not used just for keeping compiler happy. + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; +/** @} */ + /** * @brief Base class for gemm-like NT contractions * diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh new file mode 100644 index 0000000000..9d00d9b9f4 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -0,0 +1,612 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include +#include +#include "processing.hpp" + +namespace raft { +namespace spatial { +namespace knn { +namespace detail { + +template +DI void loadAllWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT m, + const unsigned int numOfNN) { + const int lid = raft::laneId(); +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + if (rowId < m) { +#pragma unroll + for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { + const int idx = j * warpSize + lid; + if (idx < numOfNN) { + Pair KVPair = shDumpKV[rowId * numOfNN + idx]; + heapArr[i]->warpV[j] = KVPair.key; + heapArr[i]->warpK[j] = KVPair.value; + } + } + } + } +} + +template +DI void loadWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const int rowId, + const unsigned int numOfNN) { + const int lid = raft::laneId(); +#pragma unroll + for (int j = 0; j < heapArr->kNumWarpQRegisters; ++j) { + const int idx = j * warpSize + lid; + if (idx < numOfNN) { + Pair KVPair = shDumpKV[rowId * numOfNN + idx]; + heapArr->warpV[j] = KVPair.key; + heapArr->warpK[j] = KVPair.value; + } + } +} + +template +DI void storeWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT rowId, + const unsigned int numOfNN) { + const int lid = raft::laneId(); + +#pragma unroll + for (int j = 0; j < heapArr->kNumWarpQRegisters; ++j) { + const int idx = j * warpSize + lid; + if (idx < numOfNN) { + Pair otherKV = Pair(heapArr->warpV[j], heapArr->warpK[j]); + shDumpKV[rowId * numOfNN + idx] = otherKV; + } + } +} + +template +DI void storeWarpQGmem(myWarpSelect &heapArr, OutT *out_dists, IdxT *out_inds, + const IdxT m, const unsigned int numOfNN, + const IdxT starty) { + const int lid = raft::laneId(); +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto gmemRowId = starty + i * Policy::AccThRows; + if (gmemRowId < m) { +#pragma unroll + for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { + const auto idx = j * warpSize + lid; + if (idx < numOfNN) { + out_dists[gmemRowId * numOfNN + idx] = heapArr[i]->warpK[j]; + out_inds[gmemRowId * numOfNN + idx] = (IdxT)heapArr[i]->warpV[j]; + } + } + } + } +} + +template +DI void loadPrevTopKsGmemWarpQ(myWarpSelect &heapArr, OutT *out_dists, + IdxT *out_inds, const IdxT m, + const unsigned int numOfNN, const IdxT starty) { + const int lid = raft::laneId(); +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto gmemRowId = starty + i * Policy::AccThRows; + if (gmemRowId < m) { +#pragma unroll + for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { + const auto idx = j * warpSize + lid; + if (idx < numOfNN) { + heapArr[i]->warpK[j] = out_dists[gmemRowId * numOfNN + idx]; + heapArr[i]->warpV[j] = (uint32_t)out_inds[gmemRowId * numOfNN + idx]; + } + } + auto constexpr kLaneWarpKTop = heapArr[i]->kNumWarpQRegisters - 1; + heapArr[i]->warpKTop = + raft::shfl(heapArr[i]->warpK[kLaneWarpKTop], heapArr[i]->kLane); + } + } +} + +template +DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId, + int finalNumVals, int startId = 0) { + constexpr uint32_t mask = 0xffffffffu; + const int lid = raft::laneId(); + // calculate srcLane such that tid 0 -> 31, 1 -> 0,... 31 -> 30. 
+ // warp around 0 to 31 required for NN > 32 + const auto srcLane = (warpSize + (lid - 1)) & (warpSize - 1); + + for (int k = startId; k < finalNumVals; k++) { + Pair KVPair = allWarpTopKs[rowId * (256) + k]; +#pragma unroll + for (int i = 0; i < NumWarpQRegs; i++) { + unsigned activeLanes = + __ballot_sync(mask, KVPair.value < heapArr->warpK[i]); + if (activeLanes) { + Pair tempKV; + tempKV.value = raft::shfl(heapArr->warpK[i], srcLane); + tempKV.key = raft::shfl(heapArr->warpV[i], srcLane); + const auto firstActiveLane = __ffs(activeLanes); + if (firstActiveLane == (lid + 1)) { + heapArr->warpK[i] = KVPair.value; + heapArr->warpV[i] = KVPair.key; + } else if (activeLanes & ((uint32_t)1 << lid)) { + heapArr->warpK[i] = tempKV.value; + heapArr->warpV[i] = tempKV.key; + } + if (i == 0 && NumWarpQRegs > 1) { + if (lid == 0) { + heapArr->warpK[1] = tempKV.value; + heapArr->warpV[1] = tempKV.key; + } + heapArr->warpK[1] = __shfl_up_sync(mask, heapArr->warpK[1], 1); + heapArr->warpV[1] = __shfl_up_sync(mask, heapArr->warpV[1], 1); + break; + } + } + } + } +} + +template +__global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN( + const DataT *x, const DataT *y, const DataT *_xn, const DataT *_yn, + const IdxT m, const IdxT n, const IdxT k, const IdxT lda, const IdxT ldb, + const IdxT ldd, CoreLambda core_op, FinalLambda fin_op, bool sqrt, + unsigned int numOfNN, int *mutexes, OutT *out_dists, IdxT *out_inds) { + extern __shared__ char smem[]; + + typedef cub::KeyValuePair Pair; + constexpr auto identity = std::numeric_limits::max(); + constexpr auto keyMax = std::numeric_limits::max(); + constexpr auto Dir = false; + typedef faiss::gpu::WarpSelect< + AccT, uint32_t, Dir, faiss::gpu::Comparator, NumWarpQ, NumThreadQ, 32> + myWarpSelect; + + auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds, + mutexes] __device__(IdxT gridStrideY) { + if (gridDim.x == 1) { + return; + } + + volatile int *mutex = mutexes; + Pair *shDumpKV = (Pair *)(&smem[Policy::SmemSize]); + const int lid = threadIdx.x % warpSize; + const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); + + // 0 -> consumer done consuming the buffer. + // -1 -> consumer started consuming the buffer + // -2 -> producer done filling the buffer + // blockIdx.x -> prod started to fill the buffer + if (blockIdx.x == 0) { + auto cta_processed = 0; + myWarpSelect heapArr1(identity, keyMax, numOfNN); + myWarpSelect heapArr2(identity, keyMax, numOfNN); + myWarpSelect *heapArr[] = {&heapArr1, &heapArr2}; + __syncthreads(); + + loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); + + while (cta_processed < gridDim.x - 1) { + Pair otherKV[Policy::AccRowsPerTh]; + + if (threadIdx.x == 0) { + int32_t old = -3; + while (old != -1) { + old = atomicCAS((int *)&mutex[gridStrideY / Policy::Mblk], -2, -1); + } + __threadfence(); + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + otherKV[i].value = identity; + otherKV[i].key = keyMax; + + if (lid < numOfNN && rowId < m) { + otherKV[i].value = out_dists[rowId * numOfNN + lid]; + otherKV[i].key = (uint32_t)out_inds[rowId * numOfNN + lid]; + } + } + __threadfence(); + + if (threadIdx.x == 0) { + mutex[gridStrideY / Policy::Mblk] = 0; + __threadfence(); + } + + // Perform merging of otherKV with topk's across warp. 
+#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + if (rowId < m) { + heapArr[i]->add(otherKV[i].value, otherKV[i].key); + } + } + + cta_processed++; + } +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + if (rowId < m) { + bool needSort = (heapArr[i]->numVals > 0); + needSort = __any_sync(0xffffffff, needSort); + if (needSort) { + heapArr[i]->reduce(); + } + } + } + storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, + starty); + } else { + if (threadIdx.x == 0) { + int32_t old = -1; + int32_t blkIdX = (int32_t)blockIdx.x; + while (old != blkIdX) { + old = atomicCAS((int *)&mutex[gridStrideY / Policy::Mblk], 0, blkIdX); + } + __threadfence(); + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + const auto shMemRowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + if (rowId < m) { + for (int idx = lid; idx < numOfNN; idx += warpSize) { + Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx]; + out_dists[rowId * numOfNN + idx] = KVPair.value; + out_inds[rowId * numOfNN + idx] = (IdxT)KVPair.key; + } + } + } + __threadfence(); + + if (threadIdx.x == 0) { + mutex[gridStrideY / Policy::Mblk] = -2; + __threadfence(); + } + } + }; + + // epilogue operation lambda for final value calculation + auto epilog_lambda = + [numOfNN, sqrt, m, n, ldd, out_dists, out_inds] __device__( + AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT * regxn, + DataT * regyn, IdxT gridStrideX, IdxT gridStrideY) { + if (sqrt) { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = raft::mySqrt(acc[i][j]); + } + } + } + Pair *shDumpKV = (Pair *)(&smem[Policy::SmemSize]); + + constexpr uint32_t mask = 0xffffffffu; + const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols); + const IdxT startx = gridStrideX + (threadIdx.x % Policy::AccThCols); + const int lid = raft::laneId(); + + myWarpSelect heapArr1(identity, keyMax, numOfNN); + myWarpSelect heapArr2(identity, keyMax, numOfNN); + myWarpSelect *heapArr[] = {&heapArr1, &heapArr2}; + if (usePrevTopKs) { + if (gridStrideX == blockIdx.x * Policy::Nblk) { + loadPrevTopKsGmemWarpQ(heapArr, out_dists, out_inds, m, + numOfNN, starty); + } + } + + if (gridStrideX > blockIdx.x * Policy::Nblk) { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + Pair tempKV = shDumpKV[(rowId * numOfNN) + numOfNN - 1]; + heapArr[i]->warpKTop = tempKV.value; + } + + // total vals can atmost be 256, (32*8) + int numValsWarpTopK[Policy::AccRowsPerTh]; + int anyWarpTopKs = 0; +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto rowId = starty + i * Policy::AccThRows; + numValsWarpTopK[i] = 0; + if (rowId < m) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + const auto colId = startx + j * Policy::AccThCols; + if (colId < ldd) { + if (acc[i][j] < heapArr[i]->warpKTop) { + numValsWarpTopK[i]++; + } + } + } + anyWarpTopKs += numValsWarpTopK[i]; + } + } + anyWarpTopKs = __syncthreads_or(anyWarpTopKs > 0); + if (anyWarpTopKs) { + Pair *allWarpTopKs = (Pair *)(&smem[0]); + uint32_t needScanSort[Policy::AccRowsPerTh]; + +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto 
gmemRowId = starty + i * Policy::AccThRows; + needScanSort[i] = 0; + if (gmemRowId < m) { + int myVals = numValsWarpTopK[i]; + needScanSort[i] = __ballot_sync(mask, myVals > 0); + if (needScanSort[i]) { +#pragma unroll + for (unsigned int k = 1; k <= 16; k *= 2) { + const unsigned int n = + __shfl_up_sync(mask, numValsWarpTopK[i], k); + if (lid >= k) { + numValsWarpTopK[i] += n; + } + } + } + // As each thread will know its total vals to write. + // we only store its starting location. + numValsWarpTopK[i] -= myVals; + } + + if (needScanSort[i]) { + const auto rowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + if (gmemRowId < m) { + if (needScanSort[i] & ((uint32_t)1 << lid)) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + const auto colId = startx + j * Policy::AccThCols; + if (colId < ldd) { + if (acc[i][j] < heapArr[i]->warpKTop) { + Pair otherKV = {colId, acc[i][j]}; + allWarpTopKs[rowId * (256) + numValsWarpTopK[i]] = + otherKV; + numValsWarpTopK[i]++; + } + } + } + } + const int finalNumVals = raft::shfl(numValsWarpTopK[i], 31); + loadWarpQShmem(heapArr[i], &shDumpKV[0], rowId, + numOfNN); + updateSortedWarpQkNumWarpQRegisters>( + heapArr[i], &allWarpTopKs[0], rowId, finalNumVals); + } + } + } + __syncthreads(); +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + if (needScanSort[i]) { + const auto rowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto gmemRowId = starty + i * Policy::AccThRows; + if (gmemRowId < m) { + storeWarpQShmem(heapArr[i], shDumpKV, rowId, + numOfNN); + } + } + } + } + } else { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + const auto gmemRowId = starty + i * Policy::AccThRows; + const auto shMemRowId = + (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + if (gmemRowId < m) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + const auto colId = startx + j * Policy::AccThCols; + Pair otherKV = {keyMax, identity}; + if (colId < ldd) { + otherKV.value = acc[i][j]; + otherKV.key = colId; + } + heapArr[i]->add(otherKV.value, otherKV.key); + } + + bool needSort = (heapArr[i]->numVals > 0); + needSort = __any_sync(mask, needSort); + if (needSort) { + heapArr[i]->reduce(); + } + storeWarpQShmem(heapArr[i], shDumpKV, shMemRowId, + numOfNN); + } + } + } + + if (((gridStrideX + Policy::Nblk * gridDim.x) > n) && gridDim.x == 1) { + // This is last iteration of grid stride X + loadAllWarpQShmem(heapArr, &shDumpKV[0], m, numOfNN); + storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, + starty); + } + }; + + raft::distance::PairwiseDistances + obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, nullptr, smem, core_op, + epilog_lambda, fin_op, rowEpilog_lambda); + obj.run(); +} + +template +void fusedL2kNNImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, + IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *out_dists, + IdxT *out_inds, IdxT numOfNN, cudaStream_t stream, + void *workspace, size_t &worksize) { + typedef typename raft::linalg::Policy2x8::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + + typedef typename std::conditional::type KPolicy; + + ASSERT(isRowMajor, "Only Row major inputs are allowed"); + + dim3 blk(KPolicy::Nthreads); + // Accumulation operation lambda + auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { + const auto diff = x - y; + acc += diff * diff; + }; + + auto fin_op = [] __device__(AccT d_val, int g_d_idx) { return d_val; }; + + typedef cub::KeyValuePair 
Pair; + + if (isRowMajor) { + constexpr auto fusedL2kNN32RowMajor = + fusedL2kNN; + constexpr auto fusedL2kNN64RowMajor = + fusedL2kNN; + + auto fusedL2kNNRowMajor = fusedL2kNN32RowMajor; + if (numOfNN <= 32) { + fusedL2kNNRowMajor = fusedL2kNN32RowMajor; + } else if (numOfNN <= 64) { + fusedL2kNNRowMajor = fusedL2kNN64RowMajor; + } else { + ASSERT(numOfNN <= 64, + "fusedL2kNN: num of nearest neighbors must be <= 64"); + } + + dim3 grid = raft::distance::launchConfigGenerator( + m, n, KPolicy::SmemSize, fusedL2kNNRowMajor); + if (grid.x > 1) { + const auto numMutexes = raft::ceildiv(m, KPolicy::Mblk); + if (workspace == nullptr || worksize < (sizeof(int32_t) * numMutexes)) { + worksize = sizeof(int32_t) * numMutexes; + return; + } else { + CUDA_CHECK( + cudaMemsetAsync(workspace, 0, sizeof(int32_t) * numMutexes, stream)); + } + } + + const auto sharedMemSize = + KPolicy::SmemSize + (KPolicy::Mblk * numOfNN * sizeof(Pair)); + + fusedL2kNNRowMajor<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, core_lambda, fin_op, sqrt, + (uint32_t)numOfNN, (int *)workspace, out_dists, out_inds); + } else { + } + + CUDA_CHECK(cudaGetLastError()); +} + +template +void fusedL2kNN(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, + const DataT *x, const DataT *y, bool sqrt, OutT *out_dists, + IdxT *out_inds, IdxT numOfNN, cudaStream_t stream, + void *workspace, size_t &worksize) { + size_t bytesA = sizeof(DataT) * lda; + size_t bytesB = sizeof(DataT) * ldb; + if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { + fusedL2kNNImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, + out_inds, numOfNN, stream, workspace, worksize); + } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { + fusedL2kNNImpl(x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, + out_inds, numOfNN, stream, workspace, worksize); + } else { + fusedL2kNNImpl( + x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream, + workspace, worksize); + } +} + +/** + * Compute the k-nearest neighbors using L2 unexpanded distance. + + * @tparam value_idx + * @tparam value_t + * @param[out] out_inds output indices array on device (size n_query_rows * k) + * @param[out] out_dists output dists array on device (size n_query_rows * k) + * @param[in] index input index array on device (size n_index_rows * D) + * @param[in] query input query array on device (size n_query_rows * D) + * @param[in] n_index_rows number of rows in index array + * @param[in] n_query_rows number of rows in query array + * @param[in] k number of closest neighbors to return + * @param[in] rowMajorIndex are the index arrays in row-major layout? + * @param[in] rowMajorQuery are the query array in row-major layout? 
+ * @param[in] stream stream to order kernel launch + */ +template +void l2_unexpanded_knn(size_t D, value_idx *out_inds, value_t *out_dists, + const value_t *index, const value_t *query, + size_t n_index_rows, size_t n_query_rows, int k, + bool rowMajorIndex, bool rowMajorQuery, + cudaStream_t stream, void *workspace, size_t &worksize) { + // Validate the input data + ASSERT(k > 0, "l2Knn: k must be > 0"); + ASSERT(D > 0, "l2Knn: D must be > 0"); + ASSERT(n_index_rows > 0, "l2Knn: n_index_rows must be > 0"); + ASSERT(index, "l2Knn: index must be provided (passed null)"); + ASSERT(n_query_rows > 0, "l2Knn: n_query_rows must be > 0"); + ASSERT(query, "l2Knn: query must be provided (passed null)"); + ASSERT(out_dists, "l2Knn: out_dists must be provided (passed null)"); + ASSERT(out_inds, "l2Knn: out_inds must be provided (passed null)"); + // Currently we only support same layout for x & y inputs. + ASSERT(rowMajorIndex == rowMajorQuery, + "l2Knn: rowMajorIndex and rowMajorQuery should have same layout"); + + bool sqrt = (distanceType == raft::distance::DistanceType::L2SqrtUnexpanded); + + if (rowMajorIndex) { + value_idx lda = D, ldb = D, ldd = n_index_rows; + fusedL2kNN( + n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt, + out_dists, out_inds, k, stream, workspace, worksize); + } else { + // TODO: Add support for column major layout + } +} + +} // namespace detail +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh index 84c130b0e4..94ace19580 100644 --- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh @@ -34,6 +34,7 @@ #include #include +#include "fused_l2_knn.cuh" #include "haversine_distance.cuh" #include "processing.hpp" @@ -269,44 +270,81 @@ void brute_force_knn_impl(std::vector &input, std::vector &sizes, cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i); - switch (metric) { - case raft::distance::DistanceType::Haversine: - - ASSERT(D == 2, - "Haversine distance requires 2 dimensions " - "(latitude / longitude)."); - - haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, - k, stream); - break; - default: - faiss::MetricType m = build_faiss_metric(metric); - - faiss::gpu::StandardGpuResources gpu_res; - - gpu_res.noTempMemory(); - gpu_res.setDefaultStream(device, stream); - - faiss::gpu::GpuDistanceParams args; - args.metric = m; - args.metricArg = metricArg; - args.k = k; - args.dims = D; - args.vectors = input[i]; - args.vectorsRowMajor = rowMajorIndex; - args.numVectors = sizes[i]; - args.queries = search_items; - args.queriesRowMajor = rowMajorQuery; - args.numQueries = n; - args.outDistances = out_d_ptr; - args.outIndices = out_i_ptr; - - /** - * @todo: Until FAISS supports pluggable allocation strategies, - * we will not reap the benefits of the pool allocator for - * avoiding device-wide synchronizations from cudaMalloc/cudaFree - */ - bfKnn(&gpu_res, args); + if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true && + (metric == raft::distance::DistanceType::L2Unexpanded || + metric == raft::distance::DistanceType::L2SqrtUnexpanded || + metric == raft::distance::DistanceType::L2Expanded || + metric == raft::distance::DistanceType::L2SqrtExpanded)) { + size_t worksize = 0; + void *workspace = nullptr; + + switch (metric) { + case 
raft::distance::DistanceType::L2Expanded: + case raft::distance::DistanceType::L2Unexpanded: + case raft::distance::DistanceType::L2SqrtExpanded: + // Even for L2 Sqrt distance case we use non-sqrt version + // as FAISS bfKNN only support non-sqrt metric & some tests + // in RAFT/cuML (like Linkage) fails if we use L2 sqrt. + // Even for L2 Sqrt distance case we use non-sqrt version + // as FAISS bfKNN only support non-sqrt metric & some tests + // in RAFT/cuML (like Linkage) fails if we use L2 sqrt. + case raft::distance::DistanceType::L2SqrtUnexpanded: + l2_unexpanded_knn( + D, out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, + rowMajorIndex, rowMajorQuery, stream, workspace, worksize); + if (worksize) { + rmm::device_uvector d_mutexes(worksize, stream); + workspace = d_mutexes.data(); + l2_unexpanded_knn( + D, out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, + rowMajorIndex, rowMajorQuery, stream, workspace, worksize); + } + break; + default: + break; + } + } else { + switch (metric) { + case raft::distance::DistanceType::Haversine: + + ASSERT(D == 2, + "Haversine distance requires 2 dimensions " + "(latitude / longitude)."); + + haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], + n, k, stream); + break; + default: + faiss::MetricType m = build_faiss_metric(metric); + + faiss::gpu::StandardGpuResources gpu_res; + + gpu_res.noTempMemory(); + gpu_res.setDefaultStream(device, stream); + + faiss::gpu::GpuDistanceParams args; + args.metric = m; + args.metricArg = metricArg; + args.k = k; + args.dims = D; + args.vectors = input[i]; + args.vectorsRowMajor = rowMajorIndex; + args.numVectors = sizes[i]; + args.queries = search_items; + args.queriesRowMajor = rowMajorQuery; + args.numQueries = n; + args.outDistances = out_d_ptr; + args.outIndices = out_i_ptr; + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); + } } CUDA_CHECK(cudaPeekAtLastError()); From 05b6edeed9b418dbfa8572d1c7024f11fcd6955a Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 21 Sep 2021 12:54:31 -0400 Subject: [PATCH 020/171] Make sure we keep the rapids-cmake and raft cal version in sync (#331) When we make a new raft version, we need to also bump the rapids-cmake version at the same time. Otherwise we will get the previous releases dependencies by mistake. 
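For example, when the RAFT CalVer moves from 21.10.00 to 21.12.00 (as in the DOC update patch above), the `branch-21.10/RAPIDS.cmake` reference in `cpp/CMakeLists.txt` must become `branch-21.12/RAPIDS.cmake`; the `sed_runner` line added below rewrites exactly that branch component using `${NEXT_SHORT_TAG}`.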
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/331 --- ci/release/update-version.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index fcdb6a2233..45383aaf68 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -30,4 +30,5 @@ function sed_runner() { sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak } -sed_runner 's/'"RAFT VERSION .* LANGUAGES"'/'"RAFT VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt \ No newline at end of file +sed_runner 's/'"RAFT VERSION .* LANGUAGES"'/'"RAFT VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt +sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' cpp/CMakeLists.txt \ No newline at end of file From 2ecb227e16443775ade23195a82ee9797ce8ede4 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 23 Sep 2021 16:47:27 -0500 Subject: [PATCH 021/171] Pin max `dask` and `distributed` versions to `2021.09.1` (#334) Changes to be in-line with: https://github.com/rapidsai/cudf/pull/9286/ Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/334 --- ci/gpu/build.sh | 4 ++-- ci/local/old-gpubuild.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 0ba9901107..6f5b5c3c6c 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -59,8 +59,8 @@ gpuci_mamba_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid # Install the master version of dask, distributed, and dask-ml gpuci_logger "Install the master version of dask and distributed" set -x -pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps -pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps +pip install "git+https://github.com/dask/distributed.git@2021.09.1" --upgrade --no-deps +pip install "git+https://github.com/dask/dask.git@2021.09.1" --upgrade --no-deps set +x diff --git a/ci/local/old-gpubuild.sh b/ci/local/old-gpubuild.sh index efd6c0382a..9ab4486977 100644 --- a/ci/local/old-gpubuild.sh +++ b/ci/local/old-gpubuild.sh @@ -81,8 +81,8 @@ fi # Install the master version of dask, distributed, and dask-ml set -x -pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps -pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps +pip install "git+https://github.com/dask/distributed.git@2021.09.1" --upgrade --no-deps +pip install "git+https://github.com/dask/dask.git@2021.09.1" --upgrade --no-deps set +x From 9b4adb1e3f5e3d76db9cb3b1dc971c63617b7da1 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 23 Sep 2021 18:46:08 -0400 Subject: [PATCH 022/171] Random Ball Cover Algorithm for 2D Haversine/Euclidean (#213) This PR is a proof of concept to use the triangle inequality to prune the tree of exhaustive distance computations into something smaller, such as on the order of where c is called an expansion constant, based on the dimensionality. 
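To make the pruning rule concrete (a sketch in general metric-space terms, not specific to this implementation): for a query point q, a landmark r, and an indexed point x assigned to r's ball, the triangle inequality gives dist(q, x) >= dist(q, r) - dist(r, x) >= dist(q, r) - radius(r). Once k candidates have been collected, any ball whose lower bound dist(q, r) - radius(r) already exceeds the current k-th smallest distance can be discarded without computing a single point-to-point distance.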
This should (hopefully) be able to benefit both sparse and dense k-nearest neighbors and all algorithms that use them, hopefully providing a significant speedup for our sparse semirings primitive when only the k-nearest neighbors are desired. The goal here is to construct a tree out of the random ball cover algorithm such that we can utilize it in algorithms which would otherwise be able to make efficient use of a ball tree. However, there are additional challenges to this algorithm on the GPU, such as being able to batch the tree lookups. Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - William Hicks (https://github.com/wphicks) - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/raft/pull/213 --- cpp/include/raft/cache/cache_util.cuh | 6 +- cpp/include/raft/sparse/linalg/degree.cuh | 10 +- cpp/include/raft/sparse/selection/knn.cuh | 7 +- .../raft/sparse/selection/knn_graph.cuh | 3 +- cpp/include/raft/spatial/knn/ball_cover.hpp | 162 ++++ .../raft/spatial/knn/ball_cover_common.h | 99 +++ .../knn/detail/ann_quantized_faiss.cuh | 2 + .../raft/spatial/knn/detail/ball_cover.cuh | 351 +++++++++ .../spatial/knn/detail/ball_cover/common.cuh | 91 +++ .../knn/detail/ball_cover/registers.cuh | 537 +++++++++++++ .../spatial/knn/detail/block_select_faiss.cuh | 224 ++++++ .../knn/detail/knn_brute_force_faiss.cuh | 37 +- .../knn/detail/selection_faiss.cuh} | 19 +- .../spatial/knn/detail/warp_select_faiss.cuh | 739 ++++++++++++++++++ cpp/include/raft/spatial/knn/knn.hpp | 56 +- cpp/test/CMakeLists.txt | 3 +- cpp/test/spatial/ball_cover.cu | 271 +++++++ cpp/test/spatial/knn.cu | 9 - cpp/test/{sparse => spatial}/selection.cu | 11 +- cpp/test/spatial/spatial_data.h | 27 + cpp/test/test_utils.h | 42 + 21 files changed, 2645 insertions(+), 61 deletions(-) create mode 100644 cpp/include/raft/spatial/knn/ball_cover.hpp create mode 100644 cpp/include/raft/spatial/knn/ball_cover_common.h create mode 100644 cpp/include/raft/spatial/knn/detail/ball_cover.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh rename cpp/include/raft/{sparse/selection/selection.cuh => spatial/knn/detail/selection_faiss.cuh} (94%) create mode 100644 cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh create mode 100644 cpp/test/spatial/ball_cover.cu rename cpp/test/{sparse => spatial}/selection.cu (93%) create mode 100644 cpp/test/spatial/spatial_data.h diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh index ce8ef9a095..a65227c402 100644 --- a/cpp/include/raft/cache/cache_util.cuh +++ b/cpp/include/raft/cache/cache_util.cuh @@ -41,9 +41,9 @@ namespace cache { * @param [in] n the number of elements that need to be collected * @param [out] out vectors collected from the cache, size [n_vec * n] */ -template -__global__ void get_vecs(const math_t *cache, int n_vec, const int *cache_idx, - int n, math_t *out) { +template +__global__ void get_vecs(const math_t *cache, int_t n_vec, + const idx_t *cache_idx, int_t n, math_t *out) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int row = tid % n_vec; // row idx if (tid < n_vec * n) { diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh index 9bd322c90a..ef6a067c39 100644 --- 
a/cpp/include/raft/sparse/linalg/degree.cuh +++ b/cpp/include/raft/sparse/linalg/degree.cuh @@ -43,11 +43,11 @@ namespace linalg { * @param nnz the size of the rows array * @param results array to place results */ -template -__global__ void coo_degree_kernel(const int *rows, int nnz, int *results) { +template +__global__ void coo_degree_kernel(const T *rows, int nnz, T *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if (row < nnz) { - raft::myAtomicAdd(results + rows[row], 1); + atomicAdd(results + rows[row], (T)1); } } @@ -59,8 +59,8 @@ __global__ void coo_degree_kernel(const int *rows, int nnz, int *results) { * @param results: output result array * @param stream: cuda stream to use */ -template -void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) { +template +void coo_degree(const T *rows, int nnz, T *results, cudaStream_t stream) { dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index 3566939bc4..49573a679d 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -31,8 +31,6 @@ #include #include #include -#include - #include #include @@ -339,8 +337,9 @@ class sparse_knn_t { if (metric == raft::distance::DistanceType::InnerProduct) ascending = false; // kernel to slice first (min) k cols and copy into batched merge buffer - select_k(batch_dists, batch_indices, batch_rows, batch_cols, out_dists, - out_indices, ascending, n_neighbors, handle.get_stream()); + raft::spatial::knn::select_k(batch_dists, batch_indices, batch_rows, + batch_cols, out_dists, out_indices, ascending, + n_neighbors, handle.get_stream()); } void compute_distances(csr_batcher_t &idx_batcher, diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh index 1cdd66f516..3df1c77081 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -88,11 +88,12 @@ void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) { * @param[in] n number of observations (columns) in X * @param[in] metric distance metric to use when constructing neighborhoods * @param[out] out output edge list + * @param[out] out output edge list * @param c */ template void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n, - distance::DistanceType metric, + raft::distance::DistanceType metric, raft::sparse::COO &out, int c = 15) { int k = build_k(m, c); diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp new file mode 100644 index 0000000000..e4b50c77e3 --- /dev/null +++ b/cpp/include/raft/spatial/knn/ball_cover.hpp @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include +#include "ball_cover_common.h" +#include "detail/ball_cover.cuh" +#include "detail/ball_cover/common.cuh" + +namespace raft { +namespace spatial { +namespace knn { + +template +void rbc_build_index(const raft::handle_t &handle, + BallCoverIndex &index) { + ASSERT(index.n == 2, + "Random ball cover currently only works in 2-dimensions"); + if (index.metric == raft::distance::DistanceType::Haversine) { + detail::rbc_build_index(handle, index, detail::HaversineFunc()); + } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || + index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { + detail::rbc_build_index(handle, index, detail::EuclideanFunc()); + } else { + RAFT_FAIL("Metric not support"); + } + + index.set_index_trained(); +} + +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). This + * performs an all neighbors knn, which can reuse memory when + * the index and query are the same array. This function will + * build the index and assumes rbc_build_index() has not already + * been called. + * @tparam value_idx knn index type + * @tparam value_t knn distance type + * @tparam value_int type for integers, such as number of rows/cols + * @param handle raft handle for resource management + * @param index ball cover index which has not yet been built + * @param k number of nearest neighbors to find + * @param perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results). + * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances. + * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark. + */ +template +void rbc_all_knn_query(const raft::handle_t &handle, + BallCoverIndex &index, + value_int k, value_idx *inds, value_t *dists, + bool perform_post_filtering = true, float weight = 1.0) { + ASSERT(index.n == 2, + "Random ball cover currently only works in 2-dimensions"); + if (index.metric == raft::distance::DistanceType::Haversine) { + detail::rbc_all_knn_query(handle, index, k, inds, dists, + detail::HaversineFunc(), perform_post_filtering, + weight); + } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || + index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { + detail::rbc_all_knn_query(handle, index, k, inds, dists, + detail::EuclideanFunc(), perform_post_filtering, + weight); + } else { + RAFT_FAIL("Metric not supported"); + } + + index.set_index_trained(); +} + +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). This + * function does not build the index and assumes rbc_build_index() has + * already been called. Use this function when the index and + * query arrays are different, otherwise use rbc_all_knn_query(). 
+ * @tparam value_idx index type + * @tparam value_t distances type + * @tparam value_int integer type for size info + * @param handle raft handle for resource management + * @param index ball cover index which has not yet been built + * @param k number of nearest neighbors to find + * @param query the + * @param perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results). + * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances. + * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark. + * @param k + * @param inds + * @param dists + * @param n_samples + */ +template +void rbc_knn_query(const raft::handle_t &handle, + BallCoverIndex &index, + value_int k, const value_t *query, value_int n_query_pts, + value_idx *inds, value_t *dists, + bool perform_post_filtering = true, float weight = 1.0) { + ASSERT(index.n == 2, + "Random ball cover currently only works in 2-dimensions"); + if (index.metric == raft::distance::DistanceType::Haversine) { + detail::rbc_knn_query(handle, index, k, query, n_query_pts, inds, dists, + detail::HaversineFunc(), perform_post_filtering, + weight); + } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || + index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { + detail::rbc_knn_query(handle, index, k, query, n_query_pts, inds, dists, + detail::EuclideanFunc(), perform_post_filtering, + weight); + } else { + RAFT_FAIL("Metric not supported"); + } +} + +// TODO: implement functions for: +// 4. rbc_eps_neigh() - given a populated index, perform query against different query array +// 5. rbc_all_eps_neigh() - populate a BallCoverIndex and query against training data + +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h new file mode 100644 index 0000000000..ca614bb0cb --- /dev/null +++ b/cpp/include/raft/spatial/knn/ball_cover_common.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace spatial { +namespace knn { + +/** + * Stores raw index data points, sampled landmarks, the 1-nns of index points + * to their closest landmarks, and the ball radii of each landmark. This + * class is intended to be constructed once and reused across subsequent + * queries. 
+ * @tparam value_idx + * @tparam value_t + * @tparam value_int + */ +template +class BallCoverIndex { + public: + explicit BallCoverIndex(const raft::handle_t &handle_, const value_t *X_, + value_int m_, value_int n_, + raft::distance::DistanceType metric_) + : handle(handle_), + X(X_), + m(m_), + n(n_), + metric(metric_), + /** + * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound + * + * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m) + */ + n_landmarks(sqrt(m_)), + R_indptr(sqrt(m_) + 1, handle.get_stream()), + R_1nn_cols(m_, handle.get_stream()), + R_1nn_dists(m_, handle.get_stream()), + R(sqrt(m_) * n_, handle.get_stream()), + R_radius(sqrt(m_), handle.get_stream()), + index_trained(false) {} + + value_idx *get_R_indptr() { return R_indptr.data(); } + value_idx *get_R_1nn_cols() { return R_1nn_cols.data(); } + value_t *get_R_1nn_dists() { return R_1nn_dists.data(); } + value_t *get_R_radius() { return R_radius.data(); } + value_t *get_R() { return R.data(); } + const value_t *get_X() { return X; } + + bool is_index_trained() const { return index_trained; }; + + // This should only be set by internal functions + void set_index_trained() { index_trained = true; } + + const raft::handle_t &handle; + + const value_int m; + const value_int n; + const value_int n_landmarks; + + const value_t *X; + + raft::distance::DistanceType metric; + + private: + // CSR storing the neighborhoods for each data point + rmm::device_uvector R_indptr; + rmm::device_uvector R_1nn_cols; + rmm::device_uvector R_1nn_dists; + + rmm::device_uvector R_radius; + + rmm::device_uvector R; + + protected: + bool index_trained; +}; +} // namespace knn +} // namespace spatial +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 77ad4afe96..0e91b5225d 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -17,12 +17,14 @@ #pragma once #include "../ann_common.h" +#include "knn_brute_force_faiss.cuh" #include "common_faiss.h" #include "processing.hpp" #include #include +#include "processing.hpp" #include