diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 5b2df3bcb3..941f7cc633 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include <raft/matrix/matrix.cuh>
+#include <raft/matrix/matrix.hpp>
 #include
 #include
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index cc912d7d86..a50448acbe 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -18,7 +18,7 @@
 #include
 #include
-#include <raft/matrix/matrix.cuh>
+#include <raft/matrix/matrix.hpp>
 #include
 #include
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 8b40a80903..2315920689 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -21,8 +21,8 @@
 #include
 #include
 #include
-#include <raft/matrix/math.cuh>
-#include <raft/matrix/matrix.cuh>
+#include <raft/matrix/math.hpp>
+#include <raft/matrix/matrix.hpp>
 #include
 #include
 #include "eig.cuh"
diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh
new file mode 100644
index 0000000000..f79cb397b7
--- /dev/null
+++ b/cpp/include/raft/matrix/detail/math.cuh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+
+namespace raft {
+namespace matrix {
+namespace detail {
+
+// Computes the argmax(d_in) column-wise in a DxN matrix
+template <typename T, int TPB>
+__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
+  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // compute maxIndex=argMax index for column
+  using KVP = cub::KeyValuePair<int, T>;
+  int rowStart = blockIdx.x * D;
+  KVP thread_data(-1, -raft::myInf<T>());
+
+  for (int i = threadIdx.x; i < D; i += TPB) {
+    int idx = rowStart + i;
+    thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
+  }
+
+  auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
+
+  if (threadIdx.x == 0) {
+    argmax[blockIdx.x] = maxKV.key;
+  }
+}
+
+template <typename math_t>
+void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
+            cudaStream_t stream) {
+  int D = n_rows;
+  int N = n_cols;
+  if (D <= 32) {
+    argmaxKernel<math_t, 32><<<N, 32, 0, stream>>>(in, D, N, out);
+  } else if (D <= 64) {
+    argmaxKernel<math_t, 64><<<N, 64, 0, stream>>>(in, D, N, out);
+  } else if (D <= 128) {
+    argmaxKernel<math_t, 128><<<N, 128, 0, stream>>>(in, D, N, out);
+  } else {
+    argmaxKernel<math_t, 256><<<N, 256, 0, stream>>>(in, D, N, out);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+// Utility kernel needed for signFlip.
+// Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
+// flipping the sign if the |max| value for each column is negative.
+template <typename T, int TPB>
+__global__ void signFlipKernel(T *d_in, int D, int N) {
+  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // compute maxIndex=argMax (with abs()) index for column
+  using KVP = cub::KeyValuePair<int, T>;
+  int rowStart = blockIdx.x * D;
+  KVP thread_data(0, 0);
+  for (int i = threadIdx.x; i < D; i += TPB) {
+    int idx = rowStart + i;
+    thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx])));
+  }
+  auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
+
+  // flip column sign if d_in[maxIndex] < 0
+  __shared__ bool need_sign_flip;
+  if (threadIdx.x == 0) {
+    need_sign_flip = d_in[maxKV.key] < T(0);
+  }
+  __syncthreads();
+
+  if (need_sign_flip) {
+    for (int i = threadIdx.x; i < D; i += TPB) {
+      int idx = rowStart + i;
+      d_in[idx] = -d_in[idx];
+    }
+  }
+}
+
+template <typename math_t>
+void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
+  int D = n_rows;
+  int N = n_cols;
+  auto data = inout;
+  if (D <= 32) {
+    signFlipKernel<math_t, 32><<<N, 32, 0, stream>>>(data, D, N);
+  } else if (D <= 64) {
+    signFlipKernel<math_t, 64><<<N, 64, 0, stream>>>(data, D, N);
+  } else if (D <= 128) {
+    signFlipKernel<math_t, 128><<<N, 128, 0, stream>>>(data, D, N);
+  } else {
+    signFlipKernel<math_t, 256><<<N, 256, 0, stream>>>(data, D, N);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+} // end namespace detail
+} // end namespace matrix
+} // end namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
new file mode 100644
index 0000000000..8293d01bdb
--- /dev/null
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cache/cache_util.cuh>
+#include <raft/cuda_utils.cuh>
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+namespace raft {
+namespace matrix {
+namespace detail {
+
+template <typename m_t, typename idx_array_t = int, typename idx_t = size_t>
+void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out,
+              const idx_array_t *indices, idx_t n_rows_indices,
+              cudaStream_t stream, bool rowMajor = false) {
+  if (rowMajor) {
+    const idx_t TPB = 256;
+    cache::
+      get_vecs<<<raft::ceildiv(n_rows_indices * n_cols, TPB), TPB, 0, stream>>>(
+        in, n_cols, indices, n_rows_indices, out);
+    CUDA_CHECK(cudaPeekAtLastError());
+    return;
+  }
+
+  idx_t size = n_rows_indices * n_cols;
+  auto counting = thrust::make_counting_iterator<idx_t>(0);
+
+  thrust::for_each(rmm::exec_policy(stream), counting, counting + size,
+                   [=] __device__(idx_t idx) {
+                     idx_t row = idx % n_rows_indices;
+                     idx_t col = idx / n_rows_indices;
+
+                     out[col * n_rows_indices + row] =
+                       in[col * n_rows + indices[row]];
+                   });
+}
+
+/**
+ * @brief Kernel for copying a slice of a big matrix to a small matrix whose
+ * size matches that slice
+ * @param src_d: input matrix
+ * @param m: number of rows of input matrix
+ * @param n: number of columns of input matrix
+ * @param dst_d: output matrix
+ * @param x1, y1: coordinate of the top-left point of the wanted area (0-based)
+ * @param x2, y2: coordinate of the bottom-right point of the wanted area
+ * (1-based)
+ */
+template <typename m_t, typename idx_t = int>
+__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1,
+                      idx_t y1, idx_t x2, idx_t y2) {
+  idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  idx_t dm = x2 - x1, dn = y2 - y1;
+  if (idx < dm * dn) {
+    idx_t i = idx % dm, j = idx / dm;
+    idx_t is = i + x1, js = j + y1;
+    dst_d[idx] = src_d[is + js * m];
+  }
+}
+
+template <typename m_t, typename idx_t = int>
+void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1,
+                 idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) {
+  // Slicing
+  dim3 block(64);
+  dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x);
+  slice<<<grid, block, 0, stream>>>(in, n_rows, n_cols, out, x1, y1, x2, y2);
+}
+
+/**
+ * @brief Kernel for copying the upper triangular part of a matrix to another
+ * @param src: input matrix with a size of mxn
+ * @param dst: output matrix with a size of kxk
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param k: min(n_rows, n_cols)
+ */
+template <typename m_t, typename idx_t = int>
+__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows,
+                                   idx_t n_cols, idx_t k) {
+  idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  idx_t m = n_rows, n = n_cols;
+  if (idx < m * n) {
+    idx_t i = idx % m, j = idx / m;
+    if (i < k && j < k && j >= i) {
+      dst[i + j * k] = src[idx];
+    }
+  }
+}
+
+template <typename m_t, typename idx_t = int>
+void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols,
+                         cudaStream_t stream) {
+  idx_t m = n_rows, n = n_cols;
+  idx_t k = min(m, n);
+  dim3 block(64);
+  dim3 grid((m * n + block.x - 1) / block.x);
+  getUpperTriangular<<<grid, block, 0, stream>>>(src, dst, m, n, k);
+}
+
+/**
+ * @brief Copy a vector to the diagonal of a matrix
+ * @param vec: vector of length k = min(n_rows, n_cols)
+ * @param matrix: matrix of size n_rows x n_cols
+ * @param m: number of rows of the matrix
+ * @param n: number of columns of the matrix
+ * @param k: dimensionality
+ */
+template <typename m_t, typename idx_t = int>
+__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m,
+                                           idx_t n, idx_t k) {
+  idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+
+  if (idx < k) {
+    matrix[idx + idx * m] = vec[idx];
+  }
+}
+
+template <typename m_t, typename idx_t = int>
+void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols,
+                              cudaStream_t stream) {
+  idx_t k = min(n_rows, n_cols);
+  dim3 block(64);
+  dim3 grid((k + block.x - 1) / block.x);
+  copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows,
+                                                         n_cols, k);
+}
+
+/**
+ * @brief Calculate the inverse of the diagonal of a square matrix
+ * element-wise and in place
+ * @param in: square input matrix with size len x len
+ * @param len: size of one side of the matrix
+ */
+template <typename m_t, typename idx_t = int>
+__global__ void matrixDiagonalInverse(m_t *in, idx_t len) {
+  idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  if (idx < len) {
+    in[idx + idx * len] = 1.0 / in[idx + idx * len];
+  }
+}
+
+template <typename m_t, typename idx_t = int>
+void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) {
+  dim3 block(64);
+  dim3 grid((len + block.x - 1) / block.x);
+  matrixDiagonalInverse<<<grid, block, 0, stream>>>(in, len);
+}
+
+} // end namespace detail
+} // end namespace matrix
+} // end namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.hpp
similarity index 84%
rename from cpp/include/raft/matrix/math.cuh
rename to cpp/include/raft/matrix/math.hpp
index 41ca85dce0..e67440019f 100644
--- a/cpp/include/raft/matrix/math.cuh
+++ b/cpp/include/raft/matrix/math.hpp
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "detail/math.cuh"
+
 #include
 #include
 #include
@@ -304,29 +306,6 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len,
 
 /** @} */
 
-// Computes the argmax(d_in) column-wise in a DxN matrix
-template <typename T, int TPB>
-__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
-  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  // compute maxIndex=argMax index for column
-  using KVP = cub::KeyValuePair<int, T>;
-  int rowStart = blockIdx.x * D;
-  KVP thread_data(-1, -raft::myInf<T>());
-
-  for (int i = threadIdx.x; i < D; i += TPB) {
-    int idx = rowStart + i;
-    thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
-  }
-
-  auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
-
-  if (threadIdx.x == 0) {
-    argmax[blockIdx.x] = maxKV.key;
-  }
-}
-
 /**
  * @brief Argmax: find the row idx with maximum value for each column
  * @param in: input matrix
@@ -338,51 +317,7 @@ __global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
 template <typename math_t>
 void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
             cudaStream_t stream) {
-  int D = n_rows;
-  int N = n_cols;
-  if (D <= 32) {
-    argmaxKernel<math_t, 32><<<N, 32, 0, stream>>>(in, D, N, out);
-  } else if (D <= 64) {
-    argmaxKernel<math_t, 64><<<N, 64, 0, stream>>>(in, D, N, out);
-  } else if (D <= 128) {
-    argmaxKernel<math_t, 128><<<N, 128, 0, stream>>>(in, D, N, out);
-  } else {
-    argmaxKernel<math_t, 256><<<N, 256, 0, stream>>>(in, D, N, out);
-  }
-  CUDA_CHECK(cudaPeekAtLastError());
-}
-
-// Utility kernel needed for signFlip.
-// Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
-// flipping the sign if the |max| value for each column is negative.
-template -__global__ void signFlipKernel(T *d_in, int D, int N) { - typedef cub::BlockReduce, TPB> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - // compute maxIndex=argMax (with abs()) index for column - using KVP = cub::KeyValuePair; - int rowStart = blockIdx.x * D; - KVP thread_data(0, 0); - for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; - thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx]))); - } - auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); - - // flip column sign if d_in[maxIndex] < 0 - __shared__ bool need_sign_flip; - if (threadIdx.x == 0) { - need_sign_flip = d_in[maxKV.key] < T(0); - } - __syncthreads(); - - if (need_sign_flip) { - for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; - d_in[idx] = -d_in[idx]; - } - } + detail::argmax(in, n_rows, n_cols, out, stream); } /** @@ -395,19 +330,7 @@ __global__ void signFlipKernel(T *d_in, int D, int N) { */ template void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { - int D = n_rows; - int N = n_cols; - auto data = inout; - if (D <= 32) { - signFlipKernel<<>>(data, D, N); - } else if (D <= 64) { - signFlipKernel<<>>(data, D, N); - } else if (D <= 128) { - signFlipKernel<<>>(data, D, N); - } else { - signFlipKernel<<>>(data, D, N); - } - CUDA_CHECK(cudaPeekAtLastError()); + detail::signFlip(inout, n_rows, n_cols, stream); } template diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.hpp similarity index 68% rename from cpp/include/raft/matrix/matrix.cuh rename to cpp/include/raft/matrix/matrix.hpp index 688b92da09..8dd9fbf487 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.hpp @@ -16,17 +16,15 @@ #pragma once +#include "detail/matrix.cuh" + #include #include #include #include -#include #include #include -#include -#include #include -#include namespace raft { namespace matrix { @@ -52,26 +50,8 @@ template void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, const idx_array_t *indices, idx_t n_rows_indices, cudaStream_t stream, bool rowMajor = false) { - if (rowMajor) { - const idx_t TPB = 256; - cache:: - get_vecs<<>>( - in, n_cols, indices, n_rows_indices, out); - CUDA_CHECK(cudaPeekAtLastError()); - return; - } - - idx_t size = n_rows_indices * n_cols; - auto counting = thrust::make_counting_iterator(0); - - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, - [=] __device__(idx_t idx) { - idx_t row = idx % n_rows_indices; - idx_t col = idx / n_rows_indices; - - out[col * n_rows_indices + row] = - in[col * n_rows + indices[row]]; - }); + detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, + rowMajor); } /** @@ -185,10 +165,10 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { */ template void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', - char v_separator = '\n') { + char v_separator = '\n', + cudaStream_t stream = rmm::cuda_stream_default) { std::vector h_matrix = std::vector(n_cols * n_rows); - CUDA_CHECK(cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t), - cudaMemcpyDeviceToHost)); + raft::update_host(h_matrix.data(), in, n_cols * n_rows, stream); for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { @@ -214,29 +194,6 @@ void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { } } -/** - * @brief Kernel for copying a slice of a big matrix to a small matrix with a - * size matches 
that slice - * @param src_d: input matrix - * @param m: number of rows of input matrix - * @param n: number of columns of input matrix - * @param dst_d: output matrix - * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) - * @param x2, y2: coordinate of the bottom-right point of the wanted area - * (1-based) - */ -template -__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, - idx_t y1, idx_t x2, idx_t y2) { - idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - idx_t dm = x2 - x1, dn = y2 - y1; - if (idx < dm * dn) { - idx_t i = idx % dm, j = idx / dm; - idx_t is = i + x1, js = j + y1; - dst_d[idx] = src_d[is + js * m]; - } -} - /** * @brief Slice a matrix (in-place) * @param in: input matrix @@ -253,31 +210,7 @@ __global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, template void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) { - // Slicing - dim3 block(64); - dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x); - slice<<>>(in, n_rows, n_cols, out, x1, y1, x2, y2); -} - -/** - * @brief Kernel for copying the upper triangular part of a matrix to another - * @param src: input matrix with a size of mxn - * @param dst: output matrix with a size of kxk - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param k: min(n_rows, n_cols) - */ -template -__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, - idx_t n_cols, idx_t k) { - idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - idx_t m = n_rows, n = n_cols; - if (idx < m * n) { - idx_t i = idx % m, j = idx / m; - if (i < k && j < k && j >= i) { - dst[i + j * k] = src[idx]; - } - } + detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, stream); } /** @@ -291,29 +224,7 @@ __global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, template void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - idx_t m = n_rows, n = n_cols; - idx_t k = min(m, n); - dim3 block(64); - dim3 grid((m * n + block.x - 1) / block.x); - getUpperTriangular<<>>(src, dst, m, n, k); -} - -/** - * @brief Copy a vector to the diagonal of a matrix - * @param vec: vector of length k = min(n_rows, n_cols) - * @param matrix: matrix of size n_rows x n_cols - * @param m: number of rows of the matrix - * @param n: number of columns of the matrix - * @param k: dimensionality - */ -template -__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, - idx_t n, idx_t k) { - idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - - if (idx < k) { - matrix[idx + idx * m] = vec[idx]; - } + detail::copyUpperTriangular(src, dst, n_rows, n_cols, stream); } /** @@ -327,25 +238,7 @@ __global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, template void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - idx_t k = min(n_rows, n_cols); - dim3 block(64); - dim3 grid((k + block.x - 1) / block.x); - copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, - n_cols, k); -} - -/** - * @brief Calculate the inverse of the diagonal of a square matrix - * element-wise and in place - * @param in: square input matrix with size len x len - * @param len: size of one side of the matrix - */ -template -__global__ void matrixDiagonalInverse(m_t *in, idx_t len) { - idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < len) { - in[idx + idx * len] = 1.0 / 
in[idx + idx * len]; - } + detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, stream); } /** @@ -356,9 +249,7 @@ __global__ void matrixDiagonalInverse(m_t *in, idx_t len) { */ template void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { - dim3 block(64); - dim3 grid((len + block.x - 1) / block.x); - matrixDiagonalInverse<<>>(in, len); + detail::getDiagonalInverseMatrix(in, len, stream); } /** diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/detail/rng_impl.cuh similarity index 62% rename from cpp/include/raft/random/rng.cuh rename to cpp/include/raft/random/detail/rng_impl.cuh index 3d2e44e49b..654c46bbf9 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/detail/rng_impl.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -27,10 +28,10 @@ #include #include #include -#include "rng_impl.cuh" namespace raft { namespace random { +namespace detail { /** all different generator types used */ enum GeneratorType { @@ -42,12 +43,48 @@ enum GeneratorType { GenKiss99 }; +template +DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, + Type sigma2, Type mu2) { + constexpr Type twoPi = Type(2.0) * Type(3.141592654); + constexpr Type minus2 = -Type(2.0); + Type R = raft::mySqrt(minus2 * raft::myLog(val1)); + Type theta = twoPi * val2; + Type s, c; + raft::mySinCos(theta, s, c); + val1 = R * c * sigma1 + mu1; + val2 = R * s * sigma2 + mu2; +} +template +DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { + box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); +} + +/** + * @brief generator-agnostic way of generating random numbers + * @tparam GenType the generator object that expose 'next' method + */ +template +struct Generator { + DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) + : gen(seed, subsequence, offset) {} + + template + DI void next(Type &ret) { + gen.next(ret); + } + + private: + /** the actual generator */ + GenType gen; +}; + template __global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, LenType len, Lambda randOp) { LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; - detail::Generator gen(seed, (uint64_t)tid, offset); + Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; for (LenType idx = tid; idx < len; idx += stride) { MathType val; @@ -62,7 +99,7 @@ template gen(seed, (uint64_t)tid, offset); + Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; for (LenType idx = tid; idx < len; idx += stride) { MathType val1, val2; @@ -84,47 +121,252 @@ __global__ void constFillKernel(Type *ptr, int len, Type val) { } } -/** - * @brief Helper method to compute Box Muller transform - * - * @tparam Type data type - * - * @param[inout] val1 first value - * @param[inout] val2 second value - * @param[in] sigma1 standard deviation of output gaussian for first value - * @param[in] mu1 mean of output gaussian for first value - * @param[in] sigma2 standard deviation of output gaussian for second value - * @param[in] mu2 mean of output gaussian for second value - * @{ - */ -template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, - Type sigma2, Type mu2) { - constexpr Type twoPi = Type(2.0) * Type(3.141592654); - constexpr Type minus2 = -Type(2.0); - Type R = raft::mySqrt(minus2 * raft::myLog(val1)); - Type theta = twoPi * val2; - Type s, c; - raft::mySinCos(theta, s, c); - val1 = R * c * sigma1 + mu1; - val2 
= R * s * sigma2 + mu2; -} -template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { - box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); -} -/** @} */ +/** Philox-based random number generator */ +// Courtesy: Jakub Szuppe +struct PhiloxGenerator { + /** + * @brief ctor. Initializes the state for RNG + * @param seed random seed (can be same across all threads) + * @param subsequence as found in curand docs + * @param offset as found in curand docs + */ + DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + curand_init(seed, subsequence, offset, &state); + } -/** The main random number generator class, fully on GPUs */ -class Rng { - public: /** - * @brief ctor - * @param _s 64b seed used to initialize the RNG - * @param _t backend device RNG generator type - * @note Refer to the `Rng::seed` method for details about seeding the engine + * @defgroup NextRand Generate the next random number + * @{ + */ + DI void next(float &ret) { ret = curand_uniform(&(this->state)); } + DI void next(double &ret) { ret = curand_uniform_double(&(this->state)); } + DI void next(uint32_t &ret) { ret = curand(&(this->state)); } + DI void next(uint64_t &ret) { + uint32_t a, b; + next(a); + next(b); + ret = (uint64_t)a | ((uint64_t)b << 32); + } + DI void next(int32_t &ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t &ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** the state for RNG */ + curandStatePhilox4_32_10_t state; +}; + +/** LFSR taps-filter for generating random numbers. */ +// Courtesy: Vinay Deshpande +struct TapsGenerator { + /** + * @brief ctor. Initializes the state for RNG + * @param seed the seed (can be same across all threads) + * @param subsequence unused + * @param offset unused + */ + DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; + uint64_t stride = blockDim.x * gridDim.x; + delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; + stride *= blockDim.y * gridDim.y; + delta += ((blockIdx.z * blockDim.z) + threadIdx.z) * stride; + state = seed + delta + 1; + } + + /** + * @defgroup NextRand Generate the next random number + * @{ + */ + template + DI void next(Type &ret) { + constexpr double ULL_LARGE = 1.8446744073709551614e19; + uint64_t val; + next(val); + ret = static_cast(val); + ret /= static_cast(ULL_LARGE); + } + DI void next(uint64_t &ret) { + constexpr uint64_t TAPS = 0x8000100040002000ULL; + constexpr int ROUNDS = 128; + for (int i = 0; i < ROUNDS; i++) + state = (state >> 1) ^ (-(state & 1ULL) & TAPS); + ret = state; + } + DI void next(uint32_t &ret) { + uint64_t val; + next(val); + ret = (uint32_t)val; + } + DI void next(int32_t &ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t &ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** the state for RNG */ + uint64_t state; +}; + +/** Kiss99-based random number generator */ + +struct Kiss99Generator { + /** + * @brief ctor. 
Initializes the state for RNG + * @param seed the seed (can be same across all threads) + * @param subsequence unused + * @param offset unused */ - Rng(uint64_t _s, GeneratorType _t = GenPhilox) + DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + initKiss99(seed); + } + + /** + * @defgroup NextRand Generate the next random number + * @{ + */ + template + DI void next(Type &ret) { + constexpr double U_LARGE = 4.294967295e9; + uint32_t val; + next(val); + ret = static_cast(val); + ret /= static_cast(U_LARGE); + } + DI void next(uint32_t &ret) { + uint32_t MWC; + z = 36969 * (z & 65535) + (z >> 16); + w = 18000 * (w & 65535) + (w >> 16); + MWC = ((z << 16) + w); + jsr ^= (jsr << 17); + jsr ^= (jsr >> 13); + jsr ^= (jsr << 5); + jcong = 69069 * jcong + 1234567; + MWC = ((MWC ^ jcong) + jsr); + ret = MWC; + } + DI void next(uint64_t &ret) { + uint32_t a, b; + next(a); + next(b); + ret = (uint64_t)a | ((uint64_t)b << 32); + } + DI void next(int32_t &ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t &ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** one of the kiss99 states */ + uint32_t z; + /** one of the kiss99 states */ + uint32_t w; + /** one of the kiss99 states */ + uint32_t jsr; + /** one of the kiss99 states */ + uint32_t jcong; + + // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower + // 128 bits. It uses 32-bit wide multiply only. + DI void mulByFnv1a128Prime(uint32_t *h) { + typedef union { + uint32_t u32[2]; + uint64_t u64[1]; + } words64; + + // 128-bit FNV prime = p3 * 2^96 + p2 * 2^64 + p1 * 2^32 + p0 + // Here p0 = 315, p2 = 16777216, p1 = p3 = 0 + const uint32_t p0 = uint32_t(315), p2 = uint32_t(16777216); + // Partial products + words64 h0p0, h1p0, h2p0, h0p2, h3p0, h1p2; + + h0p0.u64[0] = uint64_t(h[0]) * p0; + h1p0.u64[0] = uint64_t(h[1]) * p0; + h2p0.u64[0] = uint64_t(h[2]) * p0; + h0p2.u64[0] = uint64_t(h[0]) * p2; + h3p0.u64[0] = uint64_t(h[3]) * p0; + h1p2.u64[0] = uint64_t(h[1]) * p2; + + // h_n[0] = LO(h[0]*p[0]); + // h_n[1] = HI(h[0]*p[0]) + LO(h[1]*p[0]); + // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); + // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); + uint32_t carry = 0; + h[0] = h0p0.u32[0]; + + h[1] = h0p0.u32[1] + h1p0.u32[0]; + carry = h[1] < h0p0.u32[1] ? 1 : 0; + + h[2] = h1p0.u32[1] + carry; + carry = h[2] < h1p0.u32[1] ? 1 : 0; + h[2] += h2p0.u32[0]; + carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; + h[2] += h0p2.u32[0]; + carry = h[2] < h0p2.u32[0] ? 
carry + 1 : carry; + + h[3] = h2p0.u32[1] + h0p2.u32[1] + h3p0.u32[0] + h1p2.u32[0] + carry; + return; + } + + DI void fnv1a128(uint32_t *hash, uint32_t txt) { + hash[0] ^= (txt >> 0) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 8) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 16) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 24) & 0xFF; + mulByFnv1a128Prime(hash); + } + + DI void initKiss99(uint64_t seed) { + // Initialize hash to 128-bit FNV1a basis + uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; + + // Digest threadIdx, blockIdx and seed + fnv1a128(hash, threadIdx.x); + fnv1a128(hash, threadIdx.y); + fnv1a128(hash, threadIdx.z); + fnv1a128(hash, blockIdx.x); + fnv1a128(hash, blockIdx.y); + fnv1a128(hash, blockIdx.z); + fnv1a128(hash, uint32_t(seed)); + fnv1a128(hash, uint32_t(seed >> 32)); + + // Initialize KISS99 state with hash + z = hash[0]; + w = hash[1]; + jsr = hash[2]; + jcong = hash[3]; + } +}; + +/** The main random number generator class, fully on GPUs */ +class RngImpl { + public: + RngImpl(uint64_t _s, GeneratorType _t = GenPhilox) : type(_t), offset(0), // simple heuristic to make sure all SMs will be occupied properly @@ -134,28 +376,11 @@ class Rng { seed(_s); } - /** - * @brief Seed (and thus re-initialize) the underlying RNG engine - * @param _s 64b seed used to initialize the RNG - * @note If you need non-reproducibility, pass a seed that's, for example, a - * function of timestamp. Another example is to use the c++11's - * `std::random_device` for setting seed. - */ void seed(uint64_t _s) { gen.seed(_s); offset = 0; } - /** - * @brief Generates the 'a' and 'b' parameters for a modulo affine - * transformation equation: `(ax + b) % n` - * - * @tparam IdxT integer type - * - * @param[in] n the modulo range - * @param[out] a slope parameter - * @param[out] b intercept parameter - */ template void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { // always keep 'a' to be coprime to 'n' @@ -168,17 +393,6 @@ class Rng { b = gen() % n; } - /** - * @brief Generate uniformly distributed numbers in the given range - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param start start of the range - * @param end end of the range - * @param stream stream where to launch the kernel - * @{ - */ template void uniform(Type *ptr, LenType len, Type start, Type end, cudaStream_t stream) { @@ -203,19 +417,7 @@ class Rng { }, stream); } - /** @} */ - /** - * @brief Generate normal distributed numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param mu mean of the distribution - * @param sigma std-dev of the distribution - * @param stream stream where to launch the kernel - * @{ - */ template void normal(Type *ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) { @@ -240,28 +442,7 @@ class Rng { }, NumThreads, nBlocks, type, stream); } - /** @} */ - /** - * @brief Generate normal distributed table according to the given set of - * means and scalar standard deviations. - * - * Each row in this table conforms to a normally distributed n-dim vector - * whose mean is the input vector and standard deviation is the corresponding - * vector or scalar. 
Correlations among the dimensions itself is assumed to - * be absent. - * - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output table (dim = n_rows x n_cols) - * @param n_rows number of rows in the table - * @param n_cols number of columns in the table - * @param mu mean vector (dim = n_cols x 1). - * @param sigma_vec std-dev vector of each component (dim = n_cols x 1). Pass - * a nullptr to use the same scalar 'sigma' across all components - * @param sigma scalar sigma to be used if 'sigma_vec' is nullptr - * @param stream stream where to launch the kernel - */ template void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, const Type *sigma_vec, Type sigma, cudaStream_t stream) { @@ -280,33 +461,13 @@ class Rng { NumThreads, nBlocks, type, stream); } - /** - * @brief Fill an array with the given value - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param val value to be filled - * @param stream stream where to launch the kernel - */ template void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { - constFillKernel<<>>(ptr, len, val); + detail::constFillKernel + <<>>(ptr, len, val); CUDA_CHECK(cudaPeekAtLastError()); } - /** - * @brief Generate bernoulli distributed boolean array - * - * @tparam Type data type in which to compute the probabilities - * @tparam OutType output data type - * @tparam LenType data type used to represent length of the arrays - * - * @param[out] ptr the output array - * @param[in] len the number of elements in the output - * @param[in] prob coin-toss probability for heads - * @param[in] stream stream where to launch the kernel - */ template void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { custom_distribution( @@ -314,16 +475,6 @@ class Rng { stream); } - /** - * @brief Generate bernoulli distributed array and applies scale - * @tparam Type data type in which to compute the probabilities - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param prob coin-toss probability for heads - * @param scale scaling factor - * @param stream stream where to launch the kernel - */ template void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, cudaStream_t stream) { @@ -337,17 +488,6 @@ class Rng { stream); } - /** - * @brief Generate gumbel distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param mu mean value - * @param beta scale value - * @param stream stream where to launch the kernel - * @note https://en.wikipedia.org/wiki/Gumbel_distribution - */ template void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { custom_distribution( @@ -358,16 +498,6 @@ class Rng { stream); } - /** - * @brief Generate lognormal distributed numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param mu mean of the distribution - * @param sigma std-dev of the distribution - * @param stream stream where to 
launch the kernel - */ template void lognormal(Type *ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) { @@ -381,16 +511,6 @@ class Rng { NumThreads, nBlocks, type, stream); } - /** - * @brief Generate logistic distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param mu mean value - * @param scale scale value - * @param stream stream where to launch the kernel - */ template void logistic(Type *ptr, LenType len, Type mu, Type scale, cudaStream_t stream) { @@ -403,15 +523,6 @@ class Rng { stream); } - /** - * @brief Generate exponentially distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param lambda the lambda - * @param stream stream where to launch the kernel - */ template void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { custom_distribution( @@ -423,15 +534,6 @@ class Rng { stream); } - /** - * @brief Generate rayleigh distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param sigma the sigma - * @param stream stream where to launch the kernel - */ template void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { custom_distribution( @@ -444,16 +546,6 @@ class Rng { stream); } - /** - * @brief Generate laplace distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param mu the mean - * @param scale the scale - * @param stream stream where to launch the kernel - */ template void laplace(Type *ptr, LenType len, Type mu, Type scale, cudaStream_t stream) { @@ -474,31 +566,6 @@ class Rng { stream); } - /** - * @brief Sample the input array without replacement, optionally based on the - * input weight vector for each element in the array - * - * Implementation here is based on the `one-pass sampling` algo described here: - * https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf - * - * @note In the sampled array the elements which are picked will always appear - * in the increasing order of their weights as computed using the exponential - * distribution. So, if you're particular about the order (for eg. array - * permutations), then this might not be the right choice! - * - * @tparam DataT data type - * @tparam WeightsT weights type - * @tparam IdxT index type - * @param out output sampled array (of length 'sampledLen') - * @param outIdx indices of the sampled array (of length 'sampledLen'). Pass - * a nullptr if this is not required. - * @param in input array to be sampled (of length 'len') - * @param wts weights array (of length 'len'). 
Pass a nullptr if uniform - * sampling is desired - * @param sampledLen output sampled array length - * @param len input array length - * @param stream cuda stream - */ template void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, IdxT *outIdx, const DataT *in, @@ -535,24 +602,9 @@ class Rng { CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream)); } - scatter(out, in, outIdxPtr, sampledLen, stream); + raft::scatter(out, in, outIdxPtr, sampledLen, stream); } - /** - * @brief Core method to generate a pdf based on the cdf that is defined in - * the input device lambda - * - * @tparam OutType output type - * @tparam MathType type on which arithmetic is done - * @tparam LenTyp index type - * @tparam Lambda device lambda (or operator) - * - * @param[out] ptr output buffer [on device] [len = len] - * @param[in] len number of elements to be generated - * @param[in] randOp the device lambda or operator - * @param[in] stream cuda stream - * @{ - */ template void custom_distribution(OutType *ptr, LenType len, Lambda randOp, @@ -573,10 +625,10 @@ class Rng { /** generator type */ GeneratorType type; /** - * offset is also used to initialize curand state. - * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64), - * but is still a large period. - */ + * offset is also used to initialize curand state. + * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64), + * but is still a large period. + */ uint64_t offset; /** number of blocks to launch */ int nBlocks; @@ -617,15 +669,18 @@ class Rng { nThreads, nBlocks); switch (type) { case GenPhilox: - randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; case GenTaps: - randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; case GenKiss99: - randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; default: @@ -646,17 +701,18 @@ class Rng { nThreads, nBlocks); switch (type) { case GenPhilox: - rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenTaps: - rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenKiss99: - rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; default: @@ -667,5 +723,6 @@ class Rng { } }; +}; // end namespace detail }; // end namespace random }; // end namespace raft diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp new file mode 100644 index 0000000000..b6b0911ab0 --- /dev/null +++ b/cpp/include/raft/random/rng.hpp @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include "detail/rng_impl.cuh"
+
+namespace raft {
+namespace random {
+
+/** all different generator types used */
+using detail::GeneratorType;
+/** curand-based philox generator */
+using detail::GenPhilox;
+/** GenTaps : LFSR taps generator */
+using detail::GenTaps;
+/** GenKiss99 : kiss99 generator (currently the fastest) */
+using detail::GenKiss99;
+
+/** Philox-based random number generator */
+using detail::PhiloxGenerator;
+/** LFSR taps-filter for generating random numbers. */
+using detail::TapsGenerator;
+/** Kiss99-based random number generator */
+using detail::Kiss99Generator;
+
+/**
+ * @brief Helper method to compute Box Muller transform
+ *
+ * @tparam Type data type
+ *
+ * @param[inout] val1 first value
+ * @param[inout] val2 second value
+ * @param[in] sigma1 standard deviation of output gaussian for first value
+ * @param[in] mu1 mean of output gaussian for first value
+ * @param[in] sigma2 standard deviation of output gaussian for second value
+ * @param[in] mu2 mean of output gaussian for second value
+ * @{
+ */
+template <typename Type>
+DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1,
+                             Type sigma2, Type mu2) {
+  detail::box_muller_transform(val1, val2, sigma1, mu1, sigma2, mu2);
+}
+template <typename Type>
+DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) {
+  detail::box_muller_transform(val1, val2, sigma1, mu1);
+}
+/** @} */
+
+/** The main random number generator class, fully on GPUs */
+class Rng : public detail::RngImpl {
+ public:
+  /**
+   * @brief ctor
+   * @param _s 64b seed used to initialize the RNG
+   * @param _t backend device RNG generator type
+   * @note Refer to the `Rng::seed` method for details about seeding the engine
+   */
+  Rng(uint64_t _s, GeneratorType _t = GenPhilox) : detail::RngImpl{_s, _t} {}
+
+  /**
+   * @brief Seed (and thus re-initialize) the underlying RNG engine
+   * @param _s 64b seed used to initialize the RNG
+   * @note If you need non-reproducibility, pass a seed that's, for example, a
+   * function of timestamp. Another example is to use the c++11's
+   * `std::random_device` for setting seed.
+ */ + void seed(uint64_t _s) { detail::RngImpl::seed(_s); } + + /** + * @brief Generates the 'a' and 'b' parameters for a modulo affine + * transformation equation: `(ax + b) % n` + * + * @tparam IdxT integer type + * + * @param[in] n the modulo range + * @param[out] a slope parameter + * @param[out] b intercept parameter + */ + template + void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { + detail::RngImpl::affine_transform_params(n, a, b); + } + + /** + * @brief Generate uniformly distributed numbers in the given range + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param start start of the range + * @param end end of the range + * @param stream stream where to launch the kernel + * @{ + */ + template + void uniform(Type *ptr, LenType len, Type start, Type end, + cudaStream_t stream) { + detail::RngImpl::uniform(ptr, len, start, end, stream); + } + template + void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, + cudaStream_t stream) { + detail::RngImpl::uniformInt(ptr, len, start, end, stream); + } + /** @} */ + + /** + * @brief Generate normal distributed numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param mu mean of the distribution + * @param sigma std-dev of the distribution + * @param stream stream where to launch the kernel + * @{ + */ + template + void normal(Type *ptr, LenType len, Type mu, Type sigma, + cudaStream_t stream) { + detail::RngImpl::normal(ptr, len, mu, sigma, stream); + } + template + void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, + cudaStream_t stream) { + detail::RngImpl::normalInt(ptr, len, mu, sigma, stream); + } + /** @} */ + + /** + * @brief Generate normal distributed table according to the given set of + * means and scalar standard deviations. + * + * Each row in this table conforms to a normally distributed n-dim vector + * whose mean is the input vector and standard deviation is the corresponding + * vector or scalar. Correlations among the dimensions itself is assumed to + * be absent. + * + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output table (dim = n_rows x n_cols) + * @param n_rows number of rows in the table + * @param n_cols number of columns in the table + * @param mu mean vector (dim = n_cols x 1). + * @param sigma_vec std-dev vector of each component (dim = n_cols x 1). 
Pass + * a nullptr to use the same scalar 'sigma' across all components + * @param sigma scalar sigma to be used if 'sigma_vec' is nullptr + * @param stream stream where to launch the kernel + */ + template + void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, + const Type *sigma_vec, Type sigma, cudaStream_t stream) { + detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu, sigma_vec, sigma, + stream); + } + + /** + * @brief Fill an array with the given value + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param val value to be filled + * @param stream stream where to launch the kernel + */ + template + void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { + detail::RngImpl::fill(ptr, len, val, stream); + } + + /** + * @brief Generate bernoulli distributed boolean array + * + * @tparam Type data type in which to compute the probabilities + * @tparam OutType output data type + * @tparam LenType data type used to represent length of the arrays + * + * @param[out] ptr the output array + * @param[in] len the number of elements in the output + * @param[in] prob coin-toss probability for heads + * @param[in] stream stream where to launch the kernel + */ + template + void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { + detail::RngImpl::bernoulli(ptr, len, prob, stream); + } + + /** + * @brief Generate bernoulli distributed array and applies scale + * @tparam Type data type in which to compute the probabilities + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param prob coin-toss probability for heads + * @param scale scaling factor + * @param stream stream where to launch the kernel + */ + template + void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, + cudaStream_t stream) { + detail::RngImpl::scaled_bernoulli(ptr, len, prob, scale, stream); + } + + /** + * @brief Generate gumbel distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu mean value + * @param beta scale value + * @param stream stream where to launch the kernel + * @note https://en.wikipedia.org/wiki/Gumbel_distribution + */ + template + void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { + detail::RngImpl::gumbel(ptr, len, mu, beta, stream); + } + + /** + * @brief Generate lognormal distributed numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param mu mean of the distribution + * @param sigma std-dev of the distribution + * @param stream stream where to launch the kernel + */ + template + void lognormal(Type *ptr, LenType len, Type mu, Type sigma, + cudaStream_t stream) { + detail::RngImpl::lognormal(ptr, len, mu, sigma, stream); + } + + /** + * @brief Generate logistic distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu mean 
value + * @param scale scale value + * @param stream stream where to launch the kernel + */ + template + void logistic(Type *ptr, LenType len, Type mu, Type scale, + cudaStream_t stream) { + detail::RngImpl::logistic(ptr, len, mu, scale, stream); + } + + /** + * @brief Generate exponentially distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param lambda the lambda + * @param stream stream where to launch the kernel + */ + template + void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { + detail::RngImpl::exponential(ptr, len, lambda, stream); + } + + /** + * @brief Generate rayleigh distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param sigma the sigma + * @param stream stream where to launch the kernel + */ + template + void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { + detail::RngImpl::rayleigh(ptr, len, sigma, stream); + } + + /** + * @brief Generate laplace distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu the mean + * @param scale the scale + * @param stream stream where to launch the kernel + */ + template + void laplace(Type *ptr, LenType len, Type mu, Type scale, + cudaStream_t stream) { + detail::RngImpl::laplace(ptr, len, mu, scale, stream); + } + + /** + * @brief Sample the input array without replacement, optionally based on the + * input weight vector for each element in the array + * + * Implementation here is based on the `one-pass sampling` algo described here: + * https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf + * + * @note In the sampled array the elements which are picked will always appear + * in the increasing order of their weights as computed using the exponential + * distribution. So, if you're particular about the order (for eg. array + * permutations), then this might not be the right choice! + * + * @tparam DataT data type + * @tparam WeightsT weights type + * @tparam IdxT index type + * @param out output sampled array (of length 'sampledLen') + * @param outIdx indices of the sampled array (of length 'sampledLen'). Pass + * a nullptr if this is not required. + * @param in input array to be sampled (of length 'len') + * @param wts weights array (of length 'len'). 
Pass a nullptr if uniform + * sampling is desired + * @param sampledLen output sampled array length + * @param len input array length + * @param stream cuda stream + */ + template + void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, + IdxT *outIdx, const DataT *in, + const WeightsT *wts, IdxT sampledLen, IdxT len, + cudaStream_t stream) { + detail::RngImpl::sampleWithoutReplacement(handle, out, outIdx, in, wts, + sampledLen, len, stream); + } + + /** + * @brief Core method to generate a pdf based on the cdf that is defined in + * the input device lambda + * + * @tparam OutType output type + * @tparam MathType type on which arithmetic is done + * @tparam LenTyp index type + * @tparam Lambda device lambda (or operator) + * + * @param[out] ptr output buffer [on device] [len = len] + * @param[in] len number of elements to be generated + * @param[in] randOp the device lambda or operator + * @param[in] stream cuda stream + * @{ + */ + template + void custom_distribution(OutType *ptr, LenType len, Lambda randOp, + cudaStream_t stream) { + detail::RngImpl::custom_distribution(ptr, len, randOp, stream); + } + template + void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, + cudaStream_t stream) { + detail::RngImpl::custom_distribution2(ptr, len, randOp, stream); + } + /** @} */ +}; + +}; // end namespace random +}; // end namespace raft diff --git a/cpp/include/raft/random/rng_impl.cuh b/cpp/include/raft/random/rng_impl.cuh deleted file mode 100644 index d44c6f018b..0000000000 --- a/cpp/include/raft/random/rng_impl.cuh +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace raft { -namespace random { -namespace detail { - -/** Philox-based random number generator */ -// Courtesy: Jakub Szuppe -struct PhiloxGenerator { - /** - * @brief ctor. Initializes the state for RNG - * @param seed random seed (can be same across all threads) - * @param subsequence as found in curand docs - * @param offset as found in curand docs - */ - DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - curand_init(seed, subsequence, offset, &state); - } - - /** - * @defgroup NextRand Generate the next random number - * @{ - */ - DI void next(float& ret) { ret = curand_uniform(&(this->state)); } - DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); } - DI void next(uint32_t& ret) { ret = curand(&(this->state)); } - DI void next(uint64_t& ret) { - uint32_t a, b; - next(a); - next(b); - ret = (uint64_t)a | ((uint64_t)b << 32); - } - DI void next(int32_t& ret) { - uint32_t val; - next(val); - ret = int32_t(val & 0x7fffffff); - } - DI void next(int64_t& ret) { - uint64_t val; - next(val); - ret = int64_t(val & 0x7fffffffffffffff); - } - /** @} */ - - private: - /** the state for RNG */ - curandStatePhilox4_32_10_t state; -}; - -/** LFSR taps-filter for generating random numbers. 
*/ -// Courtesy: Vinay Deshpande -struct TapsGenerator { - /** - * @brief ctor. Initializes the state for RNG - * @param seed the seed (can be same across all threads) - * @param subsequence unused - * @param offset unused - */ - DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; - uint64_t stride = blockDim.x * gridDim.x; - delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; - stride *= blockDim.y * gridDim.y; - delta += ((blockIdx.z * blockDim.z) + threadIdx.z) * stride; - state = seed + delta + 1; - } - - /** - * @defgroup NextRand Generate the next random number - * @{ - */ - template - DI void next(Type& ret) { - constexpr double ULL_LARGE = 1.8446744073709551614e19; - uint64_t val; - next(val); - ret = static_cast(val); - ret /= static_cast(ULL_LARGE); - } - DI void next(uint64_t& ret) { - constexpr uint64_t TAPS = 0x8000100040002000ULL; - constexpr int ROUNDS = 128; - for (int i = 0; i < ROUNDS; i++) - state = (state >> 1) ^ (-(state & 1ULL) & TAPS); - ret = state; - } - DI void next(uint32_t& ret) { - uint64_t val; - next(val); - ret = (uint32_t)val; - } - DI void next(int32_t& ret) { - uint32_t val; - next(val); - ret = int32_t(val & 0x7fffffff); - } - DI void next(int64_t& ret) { - uint64_t val; - next(val); - ret = int64_t(val & 0x7fffffffffffffff); - } - /** @} */ - - private: - /** the state for RNG */ - uint64_t state; -}; - -/** Kiss99-based random number generator */ - -struct Kiss99Generator { - /** - * @brief ctor. Initializes the state for RNG - * @param seed the seed (can be same across all threads) - * @param subsequence unused - * @param offset unused - */ - DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - initKiss99(seed); - } - - /** - * @defgroup NextRand Generate the next random number - * @{ - */ - template - DI void next(Type& ret) { - constexpr double U_LARGE = 4.294967295e9; - uint32_t val; - next(val); - ret = static_cast(val); - ret /= static_cast(U_LARGE); - } - DI void next(uint32_t& ret) { - uint32_t MWC; - z = 36969 * (z & 65535) + (z >> 16); - w = 18000 * (w & 65535) + (w >> 16); - MWC = ((z << 16) + w); - jsr ^= (jsr << 17); - jsr ^= (jsr >> 13); - jsr ^= (jsr << 5); - jcong = 69069 * jcong + 1234567; - MWC = ((MWC ^ jcong) + jsr); - ret = MWC; - } - DI void next(uint64_t& ret) { - uint32_t a, b; - next(a); - next(b); - ret = (uint64_t)a | ((uint64_t)b << 32); - } - DI void next(int32_t& ret) { - uint32_t val; - next(val); - ret = int32_t(val & 0x7fffffff); - } - DI void next(int64_t& ret) { - uint64_t val; - next(val); - ret = int64_t(val & 0x7fffffffffffffff); - } - /** @} */ - - private: - /** one of the kiss99 states */ - uint32_t z; - /** one of the kiss99 states */ - uint32_t w; - /** one of the kiss99 states */ - uint32_t jsr; - /** one of the kiss99 states */ - uint32_t jcong; - - // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower - // 128 bits. It uses 32-bit wide multiply only. 
- DI void mulByFnv1a128Prime(uint32_t* h) { - typedef union { - uint32_t u32[2]; - uint64_t u64[1]; - } words64; - - // 128-bit FNV prime = p3 * 2^96 + p2 * 2^64 + p1 * 2^32 + p0 - // Here p0 = 315, p2 = 16777216, p1 = p3 = 0 - const uint32_t p0 = uint32_t(315), p2 = uint32_t(16777216); - // Partial products - words64 h0p0, h1p0, h2p0, h0p2, h3p0, h1p2; - - h0p0.u64[0] = uint64_t(h[0]) * p0; - h1p0.u64[0] = uint64_t(h[1]) * p0; - h2p0.u64[0] = uint64_t(h[2]) * p0; - h0p2.u64[0] = uint64_t(h[0]) * p2; - h3p0.u64[0] = uint64_t(h[3]) * p0; - h1p2.u64[0] = uint64_t(h[1]) * p2; - - // h_n[0] = LO(h[0]*p[0]); - // h_n[1] = HI(h[0]*p[0]) + LO(h[1]*p[0]); - // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); - // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); - uint32_t carry = 0; - h[0] = h0p0.u32[0]; - - h[1] = h0p0.u32[1] + h1p0.u32[0]; - carry = h[1] < h0p0.u32[1] ? 1 : 0; - - h[2] = h1p0.u32[1] + carry; - carry = h[2] < h1p0.u32[1] ? 1 : 0; - h[2] += h2p0.u32[0]; - carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; - h[2] += h0p2.u32[0]; - carry = h[2] < h0p2.u32[0] ? carry + 1 : carry; - - h[3] = h2p0.u32[1] + h0p2.u32[1] + h3p0.u32[0] + h1p2.u32[0] + carry; - return; - } - - DI void fnv1a128(uint32_t* hash, uint32_t txt) { - hash[0] ^= (txt >> 0) & 0xFF; - mulByFnv1a128Prime(hash); - hash[0] ^= (txt >> 8) & 0xFF; - mulByFnv1a128Prime(hash); - hash[0] ^= (txt >> 16) & 0xFF; - mulByFnv1a128Prime(hash); - hash[0] ^= (txt >> 24) & 0xFF; - mulByFnv1a128Prime(hash); - } - - DI void initKiss99(uint64_t seed) { - // Initialize hash to 128-bit FNV1a basis - uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; - - // Digest threadIdx, blockIdx and seed - fnv1a128(hash, threadIdx.x); - fnv1a128(hash, threadIdx.y); - fnv1a128(hash, threadIdx.z); - fnv1a128(hash, blockIdx.x); - fnv1a128(hash, blockIdx.y); - fnv1a128(hash, blockIdx.z); - fnv1a128(hash, uint32_t(seed)); - fnv1a128(hash, uint32_t(seed >> 32)); - - // Initialize KISS99 state with hash - z = hash[0]; - w = hash[1]; - jsr = hash[2]; - jcong = hash[3]; - } -}; - -/** - * @brief generator-agnostic way of generating random numbers - * @tparam GenType the generator object that expose 'next' method - */ -template -struct Generator { - DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) - : gen(seed, subsequence, offset) {} - - template - DI void next(Type& ret) { - gen.next(ret); - } - - private: - /** the actual generator */ - GenType gen; -}; - -}; // end namespace detail -}; // end namespace random -}; // end namespace raft diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index fc1a7c0d8d..b796b63dc8 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh index 909e28708e..7354fa3497 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh @@ -31,8 +31,8 @@ #include -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 876e91e877..b66ea025a2 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -19,8 +19,8 @@ #include #include 
#include -#include -#include +#include +#include #include namespace raft { diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh similarity index 81% rename from cpp/include/raft/stats/mean.cuh rename to cpp/include/raft/stats/detail/mean.cuh index 8691cabc85..1b338a035a 100644 --- a/cpp/include/raft/stats/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,14 @@ #pragma once -#include #include -#include #include +#include + namespace raft { namespace stats { +namespace detail { ///@todo: ColsPerBlk has been tested only for 32! template @@ -62,23 +63,6 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, } } -/** - * @brief Compute mean of the input matrix - * - * Mean operation is assumed to be performed on a given column. - * - * @tparam Type: the data type - * @tparam IdxType Integer type used to for addressing - * @param mu: the output mean vector - * @param data: the input matrix - * @param D: number of columns of data - * @param N: number of rows of data - * @param sample: whether to evaluate sample mean or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor: whether the input data is row or col major - * @param stream: cuda stream - */ template void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) { @@ -102,5 +86,6 @@ void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, CUDA_CHECK(cudaPeekAtLastError()); } -}; // namespace stats -}; // namespace raft +} // namespace detail +} // namespace stats +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh similarity index 87% rename from cpp/include/raft/stats/stddev.cuh rename to cpp/include/raft/stats/detail/stddev.cuh index f12c633829..e8917a60b3 100644 --- a/cpp/include/raft/stats/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,14 @@ #pragma once -#include #include -#include #include +#include + namespace raft { namespace stats { +namespace detail { ///@todo: ColPerBlk has been tested only for 32! template @@ -131,23 +132,23 @@ void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, } /** - * @brief Compute variance of the input matrix - * - * Variance operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param var the output stddev vector - * @param data the input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param sample whether to evaluate sample stddev or not. 
In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ + * @brief Compute variance of the input matrix + * + * Variance operation is assumed to be performed on a given column. + * + * @tparam Type the data type + * @tparam IdxType Integer type used for addressing + * @param var the output variance vector + * @param data the input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample variance or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor whether the input data is row or col major + * @param stream cuda stream where to launch work + */ template void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) { @@ -172,5 +173,6 @@ void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, CUDA_CHECK(cudaPeekAtLastError()); } -}; // namespace stats -}; // namespace raft +} // namespace detail +} // namespace stats +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh similarity index 83% rename from cpp/include/raft/stats/sum.cuh rename to cpp/include/raft/stats/detail/sum.cuh index 5f8416c7e2..37a3313ed1 100644 --- a/cpp/include/raft/stats/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,14 @@ #pragma once -#include -#include #include #include +#include + namespace raft { namespace stats { +namespace detail { ///@todo: ColsPerBlk has been tested only for 32! template @@ -62,20 +63,6 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, } } -/** - * @brief Compute sum of the input matrix - * - * Sum operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param output the output mean vector - * @param input the input matrix - * @param D number of columns of data - * @param N number of rows of data - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ template void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) { @@ -96,5 +83,6 @@ void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, CUDA_CHECK(cudaPeekAtLastError()); } -}; // end namespace stats -}; // end namespace raft +} // namespace detail +} // namespace stats +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/stats/mean.hpp b/cpp/include/raft/stats/mean.hpp new file mode 100644 index 0000000000..6e4cf39850 --- /dev/null +++ b/cpp/include/raft/stats/mean.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/mean.cuh" + +#include + +namespace raft { +namespace stats { + +/** + * @brief Compute mean of the input matrix + * + * Mean operation is assumed to be performed on a given column. + * + * @tparam Type: the data type + * @tparam IdxType Integer type used for addressing + * @param mu: the output mean vector + * @param data: the input matrix + * @param D: number of columns of data + * @param N: number of rows of data + * @param sample: whether to evaluate sample mean or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor: whether the input data is row or col major + * @param stream: cuda stream + */ +template +void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, + bool rowMajor, cudaStream_t stream) { + detail::mean(mu, data, D, N, sample, rowMajor, stream); +} + +}; // namespace stats +}; // namespace raft diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.hpp similarity index 100% rename from cpp/include/raft/stats/mean_center.cuh rename to cpp/include/raft/stats/mean_center.hpp diff --git a/cpp/include/raft/stats/stddev.hpp b/cpp/include/raft/stats/stddev.hpp new file mode 100644 index 0000000000..17c5ae457d --- /dev/null +++ b/cpp/include/raft/stats/stddev.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/stddev.cuh" + +#include + +namespace raft { +namespace stats { + +/** + * @brief Compute stddev of the input matrix + * + * Stddev operation is assumed to be performed on a given column. + * + * @tparam Type the data type + * @tparam IdxType Integer type used for addressing + * @param std the output stddev vector + * @param data the input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample stddev or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor whether the input data is row or col major + * @param stream cuda stream where to launch work + */ +template +void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, + bool sample, bool rowMajor, cudaStream_t stream) { + detail::stddev(std, data, mu, D, N, sample, rowMajor, stream); +} + +/** + * @brief Compute variance of the input matrix + * + * Variance operation is assumed to be performed on a given column.
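The new mean.hpp above is representative of the pattern this PR applies across raft/stats: a thin public wrapper that forwards to the detail:: implementation. A hypothetical call site, using only the signature shown in the diff (column-major input; buffer allocation is assumed to happen elsewhere):

  #include <raft/stats/mean.hpp>

  // Column means of an n_rows x n_cols column-major device matrix.
  void column_means(const float *d_data, int n_rows, int n_cols, float *d_mu,
                    cudaStream_t stream) {
    // D = n_cols, N = n_rows; sample = false normalizes by N, and
    // rowMajor = false because the data is column-major.
    raft::stats::mean(d_mu, d_data, n_cols, n_rows, false, false, stream);
  }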
+ * + * @tparam Type the data type + * @tparam IdxType Integer type used for addressing + * @param var the output variance vector + * @param data the input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample variance or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor whether the input data is row or col major + * @param stream cuda stream where to launch work + */ +template +void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, + bool sample, bool rowMajor, cudaStream_t stream) { + detail::vars(var, data, mu, D, N, sample, rowMajor, stream); +} + +}; // namespace stats +}; // namespace raft diff --git a/cpp/include/raft/stats/sum.hpp b/cpp/include/raft/stats/sum.hpp new file mode 100644 index 0000000000..4f67acdf36 --- /dev/null +++ b/cpp/include/raft/stats/sum.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/sum.cuh" + +#include + +namespace raft { +namespace stats { + +/** + * @brief Compute sum of the input matrix + * + * Sum operation is assumed to be performed on a given column.
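Continuing that sketch for the stddev/vars wrappers: they compose with raft::stats::mean, taking the per-column means as input. Again hypothetical, derived only from the signatures shown above:

  #include <raft/stats/stddev.hpp>

  // Sample standard deviation per column, given means from raft::stats::mean.
  void column_stddevs(const float *d_data, const float *d_mu, int n_rows,
                      int n_cols, float *d_std, cudaStream_t stream) {
    // sample = true normalizes by N - 1; rowMajor = false for column-major data.
    raft::stats::stddev(d_std, d_data, d_mu, n_cols, n_rows, true, false,
                        stream);
  }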
+ * + * @tparam Type the data type + * @tparam IdxType Integer type used for addressing + * @param output the output sum vector + * @param input the input matrix + * @param D number of columns of data + * @param N number of rows of data + * @param rowMajor whether the input data is row or col major + * @param stream cuda stream where to launch work + */ +template +void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, + cudaStream_t stream) { + detail::sum(output, input, D, N, rowMajor, stream); +} + +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index 82c2bfdfec..efa1e2cd41 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include "../test_utils.h" diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index f6b18198d2..f31fbc9165 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 8c06abf370..33782baf8d 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index eea9df046b..48ad83dfd2 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "add.cuh" diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index b60f04cd34..c8121bfbe4 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include "../test_utils.h" #include "binary_op.cuh" diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index f17a0f0f5d..fdfc3052b7 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "reduce.cuh" diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 430c35f41b..d90955147c 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "unary_op.cuh" diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 87d6c4faa6..2ac9118506 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index 3c75654733..9eb1c10313 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index 65bd7e4427..c3b26f5423 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/gemm_layout.cu
b/cpp/test/linalg/gemm_layout.cu index cecfc5eb8e..699d40d55e 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu index 4a474bc461..92e59ae49b 100644 --- a/cpp/test/linalg/gemv.cu +++ b/cpp/test/linalg/gemv.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 271ae13b2e..f04c225aa9 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index e77809def7..9d59e49e60 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include "../test_utils.h" diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 28296ad7bd..aad1d1e137 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include "../test_utils.h" #include "matrix_vector_op.cuh" diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index d0772e538d..f78ae64f05 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "unary_op.cuh" diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 94b703d15e..659956534e 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index cf7585dc23..9822ca2c60 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "reduce.cuh" diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index 56632a59cc..4f761d39f6 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "reduce.cuh" diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index df3686ee32..0a82da61c9 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index cbd6df9c8f..8ebbf19683 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -18,8 +18,8 @@ #include #include #include -#include -#include +#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index b3f7f5b126..1d8ef08673 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index c3d10d70e7..0fcf465150 100644 --- 
a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "unary_op.cuh" diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 84aa310076..7c7f29815b 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -16,8 +16,8 @@ #include #include -#include -#include +#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index 9dde1cca9a..e247abad1e 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -16,8 +16,8 @@ #include #include -#include -#include +#include +#include #include #include "../test_utils.h" diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index f0e0f6cb93..810d6cb871 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -18,14 +18,16 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include "../test_utils.h" namespace raft { namespace random { +using namespace raft::random::detail; + enum RandomType { RNG_Normal, RNG_LogNormal, diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index e51700fbb7..cef2d47276 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -18,12 +18,14 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { namespace random { +using namespace raft::random::detail; + enum RandomType { RNG_Uniform }; template diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index ecb4164616..1d33f08c62 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include "../test_utils.h" @@ -25,6 +25,8 @@ namespace raft { namespace random { +using namespace raft::random::detail; + // Terminology: // SWoR - Sample Without Replacement diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index b9d4d18e98..a5f08489f1 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -20,7 +20,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index 8bdd5b88c7..d30114bbcb 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -20,7 +20,7 @@ #include #include -#include +#include #include "../test_utils.h" diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 2f1ed99332..cd665934c2 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index 2201702b03..fbadadb29d 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index 4634e5fc0e..58ad9cf803 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index 712f95018c..c2a1c4b93c 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -16,7 +16,7 @@ 
#include #include -#include +#include #include #include #include "../test_utils.h" diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 91b9bc6c54..4900b3ff2b 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include "../test_utils.h" diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index 8011d73a6e..d527e7323e 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -20,7 +20,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index 92833630dd..7d43780cfd 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index d50211f971..77d9d3d822 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 9884202cc0..cf866a5663 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -19,8 +19,8 @@ #include #include #include -#include -#include +#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index 9845663df9..dcc4b4e551 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -16,9 +16,9 @@ #include #include -#include -#include -#include +#include +#include +#include #include "../linalg/matrix_vector_op.cuh" #include "../test_utils.h" diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 8c42b70c07..53f392aaf3 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -16,10 +16,10 @@ #include #include -#include -#include -#include -#include +#include +#include +#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index f5b341cb0e..ac4d642c8e 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -17,8 +17,8 @@ #include #include #include -#include -#include +#include +#include #include "../test_utils.h" namespace raft {
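Taken together, the test changes above switch consumers from the moved detail headers to the new public .hpp entry points. A compact end-to-end sketch in the same spirit (hypothetical; handle construction and device buffer allocation are assumed to follow RAFT's test utilities):

  #include <raft/handle.hpp>
  #include <raft/stats/mean.hpp>
  #include <raft/stats/stddev.hpp>
  #include <raft/stats/sum.hpp>

  // Per-column summary statistics of an n_rows x n_cols column-major matrix,
  // exercising only the public wrappers introduced in this PR.
  void summarize_columns(const raft::handle_t &handle, const float *d_in,
                         int n_rows, int n_cols, float *d_mu, float *d_std,
                         float *d_sum) {
    cudaStream_t stream = handle.get_stream();
    raft::stats::mean(d_mu, d_in, n_cols, n_rows, false, false, stream);
    raft::stats::stddev(d_std, d_in, d_mu, n_cols, n_rows, true, false, stream);
    raft::stats::sum(d_sum, d_in, n_cols, n_rows, false, stream);
  }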