From 08003c24c8ffa7ac2d62b8c7bb301d57be74b36e Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Thu, 24 Feb 2022 21:29:37 -0500
Subject: [PATCH] Moving device functions to cuh files and deprecating hpp (#524)

For consistency, we had originally swept through the primitive functions and used the `hpp` extension across the public API. However, it was brought to my attention more recently that this is confusing in the larger scope of the project, which also contains many host-only APIs that don't require a CUDA-enabled compiler. At the same time, as we're gaining more consumers, we need to start being more careful about making breaking changes to the public APIs and their header files.

For this reason, I'm opting to copy the existing `hpp` files into `cuh` files, deprecate the `hpp` files, and use `#define` guards with conditionals to make sure the contents of only one file get defined even if both are included (for example, when a user includes `filea.hpp` but RAFT internally includes `filea.cuh`); a minimal sketch of this pattern appears further below. This should allow us to pick a release in which to remove the offending `hpp` files and make an announcement, giving ample notice before the breaking change is made.

Authors:
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Mark Sadang (https://github.com/msadang)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/raft/pull/524
---
 build.sh | 2 +- ci/release/update-version.sh | 2 +- cpp/cmake/thirdparty/get_faiss.cmake | 2 +- cpp/include/raft/cluster/detail/kmeans.cuh | 2 +- .../raft/cluster/{kmeans.hpp => kmeans.cuh} | 2 +- cpp/include/raft/comms/comms.hpp | 2 +- cpp/include/raft/comms/comms_test.hpp | 2 +- cpp/include/raft/comms/detail/ucp_helper.hpp | 2 +- cpp/include/raft/comms/helper.hpp | 2 +- cpp/include/raft/comms/mpi_comms.hpp | 2 +- cpp/include/raft/comms/std_comms.hpp | 2 +- .../raft/distance/detail/correlation.cuh | 4 +- cpp/include/raft/distance/detail/cosine.cuh | 4 +- cpp/include/raft/distance/detail/distance.cuh | 2 +- .../raft/distance/detail/euclidean.cuh | 4 +- .../raft/distance/detail/fused_l2_nn.cuh | 4 +- .../raft/distance/detail/hellinger.cuh | 4 +- .../detail/pairwise_distance_base.cuh | 4 +- cpp/include/raft/distance/distance.cuh | 325 ++++++++++++ cpp/include/raft/distance/distance.hpp | 11 +- cpp/include/raft/distance/fused_l2_nn.cuh | 118 +++++ cpp/include/raft/distance/fused_l2_nn.hpp | 11 +- cpp/include/raft/distance/specializations.cuh | 24 + cpp/include/raft/distance/specializations.hpp | 13 +- .../detail/{canberra.hpp => canberra.cuh} | 2 +- .../detail/{chebyshev.hpp => chebyshev.cuh} | 2 +- .../{correlation.hpp => correlation.cuh} | 2 +- .../detail/{cosine.hpp => cosine.cuh} | 2 +- ..._unexpanded.hpp => hamming_unexpanded.cuh} | 2 +- ...er_expanded.hpp => hellinger_expanded.cuh} | 2 +- ...{jensen_shannon.hpp => jensen_shannon.cuh} | 2 +- .../{kl_divergence.hpp => kl_divergence.cuh} | 2 +- .../specializations/detail/{l1.hpp => l1.cuh} | 2 +- .../{l2_expanded.hpp => l2_expanded.cuh} | 2 +- ...sqrt_expanded.hpp => l2_sqrt_expanded.cuh} | 2 +- ..._unexpanded.hpp => l2_sqrt_unexpanded.cuh} | 2 +- .../{l2_unexpanded.hpp => l2_unexpanded.cuh} | 2 +- .../{lp_unexpanded.hpp => lp_unexpanded.cuh} | 2 +- .../{distance.hpp => distance.cuh} | 30 +- cpp/include/raft/label/classlabels.cuh | 121 +++++ cpp/include/raft/label/classlabels.hpp | 7 +- cpp/include/raft/label/detail/classlabels.cuh | 4 +- .../raft/label/detail/merge_labels.cuh | 4 +- cpp/include/raft/label/merge_labels.cuh | 71 +++ cpp/include/raft/label/merge_labels.hpp |
9 +- cpp/include/raft/lap/detail/d_structs.h | 2 +- cpp/include/raft/lap/detail/lap_functions.cuh | 2 +- cpp/include/raft/lap/detail/lap_kernels.cuh | 2 +- cpp/include/raft/lap/{lap.hpp => lap.cuh} | 2 +- cpp/include/raft/linalg/add.cuh | 90 ++++ cpp/include/raft/linalg/add.hpp | 9 + cpp/include/raft/linalg/axpy.cuh | 55 ++ cpp/include/raft/linalg/axpy.hpp | 11 +- cpp/include/raft/linalg/binary_op.cuh | 58 +++ cpp/include/raft/linalg/binary_op.hpp | 9 + .../raft/linalg/cholesky_r1_update.cuh | 138 ++++++ .../raft/linalg/cholesky_r1_update.hpp | 11 +- .../raft/linalg/coalesced_reduction.cuh | 76 +++ .../raft/linalg/coalesced_reduction.hpp | 9 + cpp/include/raft/linalg/contractions.cuh | 211 ++++++++ cpp/include/raft/linalg/contractions.hpp | 9 + cpp/include/raft/linalg/cublas_macros.h | 116 +++++ cpp/include/raft/linalg/cusolver_macros.h | 112 +++++ cpp/include/raft/linalg/detail/add.cuh | 4 +- .../raft/linalg/detail/{axpy.hpp => axpy.cuh} | 0 ...y_r1_update.hpp => cholesky_r1_update.cuh} | 3 +- .../raft/linalg/detail/cublas_wrappers.hpp | 1 + .../linalg/detail/{divide.hpp => divide.cuh} | 2 +- .../raft/linalg/detail/{eig.hpp => eig.cuh} | 3 +- .../detail/{eltwise.hpp => eltwise.cuh} | 4 +- .../detail/{lanczos.hpp => lanczos.cuh} | 0 .../linalg/detail/{lstsq.hpp => lstsq.cuh} | 21 +- .../raft/linalg/detail/matrix_vector_op.cuh | 2 +- ...uared_error.hpp => mean_squared_error.cuh} | 2 +- .../detail/{multiply.hpp => multiply.cuh} | 2 +- .../raft/linalg/detail/{norm.hpp => norm.cuh} | 2 +- cpp/include/raft/linalg/detail/qr.cuh | 2 +- .../linalg/detail/{reduce.hpp => reduce.cuh} | 4 +- cpp/include/raft/linalg/detail/rsvd.cuh | 16 +- .../raft/linalg/detail/strided_reduction.cuh | 2 +- cpp/include/raft/linalg/detail/subtract.cuh | 4 +- .../raft/linalg/detail/{svd.hpp => svd.cuh} | 10 +- .../detail/{transpose.hpp => transpose.cuh} | 0 cpp/include/raft/linalg/divide.cuh | 49 ++ cpp/include/raft/linalg/divide.hpp | 11 +- cpp/include/raft/linalg/eig.cuh | 120 +++++ cpp/include/raft/linalg/eig.hpp | 11 +- cpp/include/raft/linalg/eltwise.cuh | 106 ++++ cpp/include/raft/linalg/eltwise.hpp | 11 +- cpp/include/raft/linalg/gemm.cuh | 179 +++++++ cpp/include/raft/linalg/gemm.hpp | 9 + cpp/include/raft/linalg/gemv.cuh | 211 ++++++++ cpp/include/raft/linalg/gemv.hpp | 9 + cpp/include/raft/linalg/init.cuh | 60 +++ cpp/include/raft/linalg/init.hpp | 9 + cpp/include/raft/linalg/lanczos.cuh | 162 ++++++ cpp/include/raft/linalg/lanczos.hpp | 11 +- cpp/include/raft/linalg/lstsq.cuh | 121 +++++ cpp/include/raft/linalg/lstsq.hpp | 11 +- cpp/include/raft/linalg/map.cuh | 54 ++ cpp/include/raft/linalg/map.hpp | 9 + cpp/include/raft/linalg/map_then_reduce.cuh | 91 ++++ cpp/include/raft/linalg/map_then_reduce.hpp | 9 + cpp/include/raft/linalg/matrix_vector_op.cuh | 105 ++++ cpp/include/raft/linalg/matrix_vector_op.hpp | 9 + .../raft/linalg/mean_squared_error.cuh | 47 ++ .../raft/linalg/mean_squared_error.hpp | 11 +- cpp/include/raft/linalg/multiply.cuh | 47 ++ cpp/include/raft/linalg/multiply.hpp | 11 +- cpp/include/raft/linalg/norm.cuh | 94 ++++ cpp/include/raft/linalg/norm.hpp | 11 +- cpp/include/raft/linalg/power.cuh | 8 +- cpp/include/raft/linalg/power.hpp | 74 +++ cpp/include/raft/linalg/qr.cuh | 78 +++ cpp/include/raft/linalg/qr.hpp | 9 + cpp/include/raft/linalg/reduce.cuh | 81 +++ cpp/include/raft/linalg/reduce.hpp | 11 +- .../raft/linalg/reduce_cols_by_key.cuh | 4 + .../raft/linalg/reduce_cols_by_key.hpp | 62 +++ .../raft/linalg/reduce_rows_by_key.cuh | 6 +- .../raft/linalg/reduce_rows_by_key.hpp | 119 +++++ 
cpp/include/raft/linalg/rsvd.cuh | 4 + cpp/include/raft/linalg/rsvd.hpp | 148 ++++++ cpp/include/raft/linalg/sqrt.cuh | 6 +- cpp/include/raft/linalg/sqrt.hpp | 53 ++ cpp/include/raft/linalg/strided_reduction.cuh | 77 +++ cpp/include/raft/linalg/strided_reduction.hpp | 9 + cpp/include/raft/linalg/subtract.cuh | 90 ++++ cpp/include/raft/linalg/subtract.hpp | 9 + cpp/include/raft/linalg/svd.cuh | 188 +++++++ cpp/include/raft/linalg/svd.hpp | 11 +- cpp/include/raft/linalg/ternary_op.cuh | 7 +- cpp/include/raft/linalg/ternary_op.hpp | 59 +++ cpp/include/raft/linalg/transpose.cuh | 61 +++ cpp/include/raft/linalg/transpose.hpp | 11 +- cpp/include/raft/linalg/unary_op.cuh | 77 +++ cpp/include/raft/linalg/unary_op.hpp | 9 + cpp/include/raft/matrix/col_wise_sort.cuh | 56 +++ cpp/include/raft/matrix/col_wise_sort.hpp | 9 + cpp/include/raft/matrix/detail/math.cuh | 10 +- cpp/include/raft/matrix/detail/matrix.cuh | 2 +- cpp/include/raft/matrix/math.cuh | 468 ++++++++++++++++++ cpp/include/raft/matrix/math.hpp | 11 +- cpp/include/raft/matrix/matrix.cuh | 278 +++++++++++ cpp/include/raft/matrix/matrix.hpp | 9 + cpp/include/raft/mr/buffer_base.hpp | 2 +- cpp/include/raft/mr/device/buffer.hpp | 2 +- cpp/include/raft/mr/host/buffer.hpp | 2 +- cpp/include/raft/random/detail/make_blobs.cuh | 4 +- .../raft/random/detail/make_regression.cuh | 14 +- .../random/detail/multi_variable_gaussian.cuh | 4 +- cpp/include/raft/random/make_blobs.cuh | 96 ++++ cpp/include/raft/random/make_blobs.hpp | 12 +- cpp/include/raft/random/make_regression.cuh | 105 ++++ cpp/include/raft/random/make_regression.hpp | 12 +- .../raft/random/multi_variable_gaussian.cuh | 64 +++ .../raft/random/multi_variable_gaussian.hpp | 11 +- cpp/include/raft/random/permute.cuh | 63 +++ cpp/include/raft/random/permute.hpp | 11 +- cpp/include/raft/random/rng.cuh | 380 ++++++++++++++ cpp/include/raft/random/rng.hpp | 9 + cpp/include/raft/sparse/convert/coo.cuh | 46 ++ cpp/include/raft/sparse/convert/coo.hpp | 13 +- cpp/include/raft/sparse/convert/csr.cuh | 142 ++++++ cpp/include/raft/sparse/convert/csr.hpp | 13 +- cpp/include/raft/sparse/convert/dense.cuh | 67 +++ cpp/include/raft/sparse/convert/dense.hpp | 13 +- .../raft/sparse/convert/detail/coo.cuh | 2 +- .../raft/sparse/convert/detail/csr.cuh | 6 +- .../raft/sparse/convert/detail/dense.cuh | 2 +- cpp/include/raft/sparse/detail/csr.cuh | 2 +- .../raft/sparse/detail/cusparse_macros.h | 2 +- .../raft/sparse/detail/cusparse_wrappers.h | 2 +- .../sparse/distance/detail/bin_distance.cuh | 2 +- .../raft/sparse/distance/detail/coo_spmv.cuh | 2 +- .../sparse/distance/detail/ip_distance.cuh | 6 +- .../sparse/distance/detail/l2_distance.cuh | 6 +- .../sparse/distance/detail/lp_distance.cuh | 4 +- .../raft/sparse/distance/detail/utils.cuh | 2 +- cpp/include/raft/sparse/distance/distance.cuh | 137 +++++ cpp/include/raft/sparse/distance/distance.hpp | 11 +- .../sparse/hierarchy/detail/agglomerative.cuh | 2 +- .../hierarchy/detail/connectivities.cuh | 8 +- .../raft/sparse/hierarchy/detail/mst.cuh | 6 +- ...{single_linkage.hpp => single_linkage.cuh} | 2 +- .../raft/sparse/hierarchy/single_linkage.cuh | 65 +++ .../raft/sparse/hierarchy/single_linkage.hpp | 13 +- cpp/include/raft/sparse/linalg/add.cuh | 99 ++++ cpp/include/raft/sparse/linalg/add.hpp | 11 +- cpp/include/raft/sparse/linalg/degree.cuh | 123 +++++ cpp/include/raft/sparse/linalg/degree.hpp | 11 +- cpp/include/raft/sparse/linalg/detail/add.cuh | 2 +- .../raft/sparse/linalg/detail/norm.cuh | 2 +- .../raft/sparse/linalg/detail/spectral.cuh | 8 +- 
.../raft/sparse/linalg/detail/symmetrize.cuh | 8 +- .../raft/sparse/linalg/detail/transpose.h | 2 +- cpp/include/raft/sparse/linalg/norm.cuh | 73 +++ cpp/include/raft/sparse/linalg/norm.hpp | 13 +- cpp/include/raft/sparse/linalg/spectral.cuh | 43 ++ cpp/include/raft/sparse/linalg/spectral.hpp | 11 +- cpp/include/raft/sparse/linalg/symmetrize.cuh | 168 +++++++ cpp/include/raft/sparse/linalg/symmetrize.hpp | 11 +- cpp/include/raft/sparse/linalg/transpose.cuh | 74 +++ cpp/include/raft/sparse/linalg/transpose.hpp | 11 +- cpp/include/raft/sparse/mst/mst.cuh | 6 +- cpp/include/raft/sparse/mst/mst.hpp | 63 +++ cpp/include/raft/sparse/op/detail/filter.cuh | 4 +- cpp/include/raft/sparse/op/detail/reduce.cuh | 6 +- cpp/include/raft/sparse/op/detail/row_op.cuh | 2 +- .../sparse/op/detail/{slice.h => slice.cuh} | 4 +- cpp/include/raft/sparse/op/detail/sort.h | 2 +- cpp/include/raft/sparse/op/filter.cuh | 94 ++++ cpp/include/raft/sparse/op/filter.hpp | 11 +- cpp/include/raft/sparse/op/reduce.cuh | 87 ++++ cpp/include/raft/sparse/op/reduce.hpp | 11 +- cpp/include/raft/sparse/op/row_op.cuh | 48 ++ cpp/include/raft/sparse/op/row_op.hpp | 11 +- cpp/include/raft/sparse/op/slice.cuh | 81 +++ cpp/include/raft/sparse/op/slice.hpp | 13 +- cpp/include/raft/sparse/op/sort.cuh | 78 +++ cpp/include/raft/sparse/op/sort.hpp | 11 +- .../sparse/selection/connect_components.cuh | 82 +++ .../sparse/selection/connect_components.hpp | 11 +- .../selection/detail/connect_components.cuh | 14 +- .../raft/sparse/selection/detail/knn.cuh | 12 +- .../sparse/selection/detail/knn_graph.cuh | 6 +- cpp/include/raft/sparse/selection/knn.cuh | 102 ++++ cpp/include/raft/sparse/selection/knn.hpp | 11 +- .../raft/sparse/selection/knn_graph.cuh | 63 +++ .../raft/sparse/selection/knn_graph.hpp | 11 +- cpp/include/raft/spatial/knn/ann.cuh | 87 ++++ cpp/include/raft/spatial/knn/ann.hpp | 11 +- cpp/include/raft/spatial/knn/ann_common.h | 2 +- cpp/include/raft/spatial/knn/ball_cover.cuh | 192 +++++++ cpp/include/raft/spatial/knn/ball_cover.hpp | 9 + .../raft/spatial/knn/ball_cover_common.h | 2 +- .../knn/detail/ann_quantized_faiss.cuh | 4 +- .../raft/spatial/knn/detail/ball_cover.cuh | 6 +- .../raft/spatial/knn/detail/common_faiss.h | 2 +- .../knn/detail/epsilon_neighborhood.cuh | 2 +- .../raft/spatial/knn/detail/fused_l2_knn.cuh | 4 +- .../spatial/knn/detail/haversine_distance.cuh | 2 +- .../knn/detail/knn_brute_force_faiss.cuh | 2 +- .../raft/spatial/knn/detail/processing.hpp | 12 +- .../raft/spatial/knn/epsilon_neighborhood.cuh | 64 +++ .../raft/spatial/knn/epsilon_neighborhood.hpp | 9 + cpp/include/raft/spatial/knn/knn.cuh | 162 ++++++ cpp/include/raft/spatial/knn/knn.hpp | 11 +- .../raft/spatial/knn/specializations.cuh | 26 + .../raft/spatial/knn/specializations.hpp | 17 +- .../{ball_cover.hpp => ball_cover.cuh} | 4 +- .../{fused_l2_knn.hpp => fused_l2_knn.cuh} | 2 +- .../knn/specializations/{knn.hpp => knn.cuh} | 4 +- cpp/include/raft/spectral/cluster_solvers.cuh | 84 ++++ cpp/include/raft/spectral/cluster_solvers.hpp | 15 +- cpp/include/raft/spectral/detail/lapack.hpp | 2 +- ...atrix_wrappers.cuh => matrix_wrappers.hpp} | 2 +- .../detail/modularity_maximization.hpp | 6 +- .../raft/spectral/detail/partition.hpp | 6 +- .../raft/spectral/detail/spectral_util.cuh | 2 +- cpp/include/raft/spectral/eigen_solvers.cuh | 107 ++++ cpp/include/raft/spectral/eigen_solvers.hpp | 14 +- cpp/include/raft/spectral/matrix_wrappers.hpp | 4 +- .../raft/spectral/modularity_maximization.cuh | 92 ++++ .../raft/spectral/modularity_maximization.hpp | 11 +- 
cpp/include/raft/spectral/partition.cuh | 102 ++++ cpp/include/raft/spectral/partition.hpp | 12 +- cpp/include/raft/stats/accuracy.cuh | 45 ++ cpp/include/raft/stats/accuracy.hpp | 9 + .../raft/stats/adjusted_rand_index.cuh | 54 ++ .../raft/stats/adjusted_rand_index.hpp | 10 + cpp/include/raft/stats/completeness_score.cuh | 52 ++ cpp/include/raft/stats/completeness_score.hpp | 11 +- cpp/include/raft/stats/contingency_matrix.cuh | 106 ++++ cpp/include/raft/stats/contingency_matrix.hpp | 9 + cpp/include/raft/stats/cov.cuh | 63 +++ cpp/include/raft/stats/cov.hpp | 9 + .../raft/stats/detail/adjusted_rand_index.cuh | 6 +- .../detail/batched/information_criterion.cuh | 2 +- .../raft/stats/detail/completeness_score.cuh | 6 +- cpp/include/raft/stats/detail/cov.cuh | 4 +- cpp/include/raft/stats/detail/dispersion.cuh | 2 +- cpp/include/raft/stats/detail/entropy.cuh | 4 +- .../raft/stats/detail/homogeneity_score.cuh | 4 +- .../raft/stats/detail/kl_divergence.cuh | 2 +- cpp/include/raft/stats/detail/mean.cuh | 4 +- cpp/include/raft/stats/detail/mean_center.cuh | 2 +- cpp/include/raft/stats/detail/meanvar.cuh | 2 +- .../raft/stats/detail/mutual_info_score.cuh | 4 +- cpp/include/raft/stats/detail/scores.cuh | 10 +- .../raft/stats/detail/silhouette_score.cuh | 12 +- cpp/include/raft/stats/detail/stddev.cuh | 4 +- cpp/include/raft/stats/detail/sum.cuh | 4 +- .../stats/detail/trustworthiness_score.cuh | 6 +- cpp/include/raft/stats/detail/v_measure.cuh | 4 +- .../raft/stats/detail/weighted_mean.cuh | 4 +- cpp/include/raft/stats/dispersion.cuh | 61 +++ cpp/include/raft/stats/dispersion.hpp | 9 + cpp/include/raft/stats/entropy.cuh | 50 ++ cpp/include/raft/stats/entropy.hpp | 9 + cpp/include/raft/stats/histogram.cuh | 67 +++ cpp/include/raft/stats/histogram.hpp | 9 + cpp/include/raft/stats/homogeneity_score.cuh | 53 ++ cpp/include/raft/stats/homogeneity_score.hpp | 10 + .../raft/stats/information_criterion.cuh | 68 +++ .../raft/stats/information_criterion.hpp | 10 + cpp/include/raft/stats/kl_divergence.cuh | 47 ++ cpp/include/raft/stats/kl_divergence.hpp | 10 + cpp/include/raft/stats/mean.cuh | 56 +++ cpp/include/raft/stats/mean.hpp | 11 +- cpp/include/raft/stats/mean_center.cuh | 84 ++++ cpp/include/raft/stats/mean_center.hpp | 11 +- cpp/include/raft/stats/meanvar.cuh | 60 +++ cpp/include/raft/stats/meanvar.hpp | 9 + cpp/include/raft/stats/minmax.cuh | 73 +++ cpp/include/raft/stats/minmax.hpp | 9 + cpp/include/raft/stats/mutual_info_score.cuh | 52 ++ cpp/include/raft/stats/mutual_info_score.hpp | 10 + cpp/include/raft/stats/r2_score.cuh | 51 ++ cpp/include/raft/stats/r2_score.hpp | 9 + cpp/include/raft/stats/rand_index.cuh | 43 ++ cpp/include/raft/stats/rand_index.hpp | 9 + cpp/include/raft/stats/regression_metrics.cuh | 55 ++ cpp/include/raft/stats/regression_metrics.hpp | 9 + cpp/include/raft/stats/silhouette_score.cuh | 79 +++ cpp/include/raft/stats/silhouette_score.hpp | 9 + cpp/include/raft/stats/specializations.cuh | 24 + cpp/include/raft/stats/specializations.hpp | 13 +- cpp/include/raft/stats/stddev.cuh | 93 ++++ cpp/include/raft/stats/stddev.hpp | 11 +- cpp/include/raft/stats/sum.cuh | 52 ++ cpp/include/raft/stats/sum.hpp | 11 +- .../raft/stats/trustworthiness_score.cuh | 54 ++ .../raft/stats/trustworthiness_score.hpp | 9 + cpp/include/raft/stats/v_measure.cuh | 53 ++ cpp/include/raft/stats/v_measure.hpp | 12 +- cpp/include/raft/stats/weighted_mean.cuh | 65 +++ cpp/include/raft/stats/weighted_mean.hpp | 9 + ...jensen_shannon_double_double_double_int.cu | 2 +- 
.../jensen_shannon_float_float_float_int.cu | 2 +- ...jensen_shannon_float_float_float_uint32.cu | 2 +- .../kl_divergence_double_double_double_int.cu | 2 +- .../kl_divergence_float_float_float_int.cu | 2 +- .../kl_divergence_float_float_float_uint32.cu | 2 +- .../detail/l1_double_double_double_int.cu | 2 +- .../detail/l1_float_float_float_int.cu | 2 +- .../detail/l1_float_float_float_uint32.cu | 2 +- .../l2_expanded_double_double_double_int.cu | 2 +- .../l2_expanded_float_float_float_int.cu | 2 +- .../l2_expanded_float_float_float_uint32.cu | 2 +- ..._sqrt_expanded_double_double_double_int.cu | 2 +- .../l2_sqrt_expanded_float_float_float_int.cu | 2 +- ..._sqrt_expanded_float_float_float_uint32.cu | 2 +- ...qrt_unexpanded_double_double_double_int.cu | 2 +- ...2_sqrt_unexpanded_float_float_float_int.cu | 2 +- ...qrt_unexpanded_float_float_float_uint32.cu | 2 +- .../l2_unexpanded_double_double_double_int.cu | 2 +- .../l2_unexpanded_float_float_float_int.cu | 2 +- .../l2_unexpanded_float_float_float_uint32.cu | 2 +- .../lp_unexpanded_double_double_double_int.cu | 2 +- .../lp_unexpanded_float_float_float_int.cu | 2 +- .../lp_unexpanded_float_float_float_uint32.cu | 2 +- cpp/src/nn/specializations/ball_cover.cu | 10 +- cpp/src/nn/specializations/knn.cu | 4 +- cpp/test/CMakeLists.txt | 2 +- cpp/test/cluster_solvers.cu | 6 +- cpp/test/distance/dist_adj.cu | 6 +- cpp/test/distance/distance_base.cuh | 6 +- cpp/test/distance/fused_l2_nn.cu | 8 +- cpp/test/eigen_solvers.cu | 6 +- cpp/test/handle.cpp | 2 +- cpp/test/label/label.cu | 4 +- cpp/test/label/merge_labels.cu | 4 +- cpp/test/lap/lap.cu | 4 +- cpp/test/linalg/add.cu | 6 +- cpp/test/linalg/add.cuh | 4 +- cpp/test/linalg/binary_op.cu | 6 +- cpp/test/linalg/binary_op.cuh | 4 +- cpp/test/linalg/cholesky_r1.cu | 4 +- cpp/test/linalg/coalesced_reduction.cu | 6 +- cpp/test/linalg/divide.cu | 6 +- cpp/test/linalg/eig.cu | 6 +- cpp/test/linalg/eig_sel.cu | 2 +- cpp/test/linalg/eltwise.cu | 6 +- cpp/test/linalg/gemm_layout.cu | 6 +- cpp/test/linalg/gemv.cu | 6 +- cpp/test/linalg/map.cu | 8 +- cpp/test/linalg/map_then_reduce.cu | 6 +- cpp/test/linalg/matrix_vector_op.cu | 4 +- cpp/test/linalg/matrix_vector_op.cuh | 4 +- cpp/test/linalg/multiply.cu | 6 +- cpp/test/linalg/norm.cu | 6 +- cpp/test/linalg/power.cu | 2 +- cpp/test/linalg/reduce.cu | 6 +- cpp/test/linalg/reduce.cuh | 4 +- cpp/test/linalg/reduce_cols_by_key.cu | 2 +- cpp/test/linalg/reduce_rows_by_key.cu | 2 +- cpp/test/linalg/rsvd.cu | 2 +- cpp/test/linalg/sqrt.cu | 2 +- cpp/test/linalg/strided_reduction.cu | 6 +- cpp/test/linalg/subtract.cu | 6 +- cpp/test/linalg/svd.cu | 8 +- cpp/test/linalg/ternary_op.cu | 2 +- cpp/test/linalg/transpose.cu | 6 +- cpp/test/linalg/unary_op.cu | 6 +- cpp/test/linalg/unary_op.cuh | 4 +- cpp/test/matrix/columnSort.cu | 2 +- cpp/test/matrix/linewise_op.cu | 8 +- cpp/test/matrix/math.cu | 6 +- cpp/test/matrix/matrix.cu | 6 +- cpp/test/mr/device/buffer.cpp | 2 +- cpp/test/mr/host/buffer.cpp | 2 +- cpp/test/random/make_blobs.cu | 2 +- cpp/test/random/make_regression.cu | 6 +- cpp/test/random/multi_variable_gaussian.cu | 2 +- cpp/test/random/permute.cu | 4 +- cpp/test/random/rng.cu | 6 +- cpp/test/random/rng_int.cu | 2 +- cpp/test/random/sample_without_replacement.cu | 2 +- cpp/test/sparse/add.cu | 6 +- cpp/test/sparse/connect_components.cu | 12 +- cpp/test/sparse/convert_coo.cu | 6 +- cpp/test/sparse/convert_csr.cu | 6 +- cpp/test/sparse/csr_row_slice.cu | 4 +- cpp/test/sparse/csr_to_dense.cu | 4 +- cpp/test/sparse/csr_transpose.cu | 4 +- cpp/test/sparse/degree.cu | 6 
+- cpp/test/sparse/dist_coo_spmv.cu | 6 +- cpp/test/sparse/distance.cu | 4 +- cpp/test/sparse/filter.cu | 8 +- cpp/test/sparse/knn.cu | 4 +- cpp/test/sparse/knn_graph.cu | 8 +- cpp/test/sparse/linkage.cu | 6 +- cpp/test/sparse/norm.cu | 6 +- cpp/test/sparse/reduce.cu | 4 +- cpp/test/sparse/row_op.cu | 6 +- cpp/test/sparse/sort.cu | 6 +- cpp/test/sparse/symmetrize.cu | 8 +- cpp/test/spatial/ball_cover.cu | 2 +- cpp/test/spatial/epsilon_neighborhood.cu | 4 +- cpp/test/spatial/faiss_mr.cu | 4 +- cpp/test/spatial/fused_l2_knn.cu | 6 +- cpp/test/spatial/haversine.cu | 2 +- cpp/test/spatial/knn.cu | 6 +- cpp/test/spatial/selection.cu | 6 +- cpp/test/spatial/spatial_data.h | 58 ++- cpp/test/spectral_matrix.cu | 2 +- cpp/test/stats/adjusted_rand_index.cu | 4 +- cpp/test/stats/completeness_score.cu | 6 +- cpp/test/stats/contingencyMatrix.cu | 2 +- cpp/test/stats/cov.cu | 6 +- cpp/test/stats/dispersion.cu | 4 +- cpp/test/stats/entropy.cu | 2 +- cpp/test/stats/histogram.cu | 4 +- cpp/test/stats/homogeneity_score.cu | 4 +- cpp/test/stats/information_criterion.cu | 2 +- cpp/test/stats/kl_divergence.cu | 2 +- cpp/test/stats/mean.cu | 6 +- cpp/test/stats/mean_center.cu | 8 +- cpp/test/stats/meanvar.cu | 6 +- cpp/test/stats/minmax.cu | 4 +- cpp/test/stats/mutual_info_score.cu | 4 +- cpp/test/stats/rand_index.cu | 2 +- cpp/test/stats/silhouette_score.cu | 4 +- cpp/test/stats/stddev.cu | 10 +- cpp/test/stats/sum.cu | 8 +- cpp/test/stats/trustworthiness.cu | 6 +- cpp/test/stats/v_measure.cu | 4 +- cpp/test/stats/weighted_mean.cu | 4 +- python/raft/dask/common/comms_utils.pyx | 2 +- python/raft/dask/common/nccl.pyx | 2 +- 471 files changed, 11388 insertions(+), 646 deletions(-) rename cpp/include/raft/cluster/{kmeans.hpp => kmeans.cuh} (98%) create mode 100644 cpp/include/raft/distance/distance.cuh create mode 100644 cpp/include/raft/distance/fused_l2_nn.cuh create mode 100644 cpp/include/raft/distance/specializations.cuh rename cpp/include/raft/distance/specializations/detail/{canberra.hpp => canberra.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{chebyshev.hpp => chebyshev.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{correlation.hpp => correlation.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{cosine.hpp => cosine.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{hamming_unexpanded.hpp => hamming_unexpanded.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{hellinger_expanded.hpp => hellinger_expanded.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{jensen_shannon.hpp => jensen_shannon.cuh} (98%) rename cpp/include/raft/distance/specializations/detail/{kl_divergence.hpp => kl_divergence.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{l1.hpp => l1.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{l2_expanded.hpp => l2_expanded.cuh} (98%) rename cpp/include/raft/distance/specializations/detail/{l2_sqrt_expanded.hpp => l2_sqrt_expanded.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{l2_sqrt_unexpanded.hpp => l2_sqrt_unexpanded.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{l2_unexpanded.hpp => l2_unexpanded.cuh} (97%) rename cpp/include/raft/distance/specializations/detail/{lp_unexpanded.hpp => lp_unexpanded.cuh} (97%) rename cpp/include/raft/distance/specializations/{distance.hpp => distance.cuh} (54%) create mode 100644 cpp/include/raft/label/classlabels.cuh create mode 100644 
cpp/include/raft/label/merge_labels.cuh rename cpp/include/raft/lap/{lap.hpp => lap.cuh} (99%) create mode 100644 cpp/include/raft/linalg/add.cuh create mode 100644 cpp/include/raft/linalg/axpy.cuh create mode 100644 cpp/include/raft/linalg/binary_op.cuh create mode 100644 cpp/include/raft/linalg/cholesky_r1_update.cuh create mode 100644 cpp/include/raft/linalg/coalesced_reduction.cuh create mode 100644 cpp/include/raft/linalg/contractions.cuh create mode 100644 cpp/include/raft/linalg/cublas_macros.h create mode 100644 cpp/include/raft/linalg/cusolver_macros.h rename cpp/include/raft/linalg/detail/{axpy.hpp => axpy.cuh} (100%) rename cpp/include/raft/linalg/detail/{cholesky_r1_update.hpp => cholesky_r1_update.cuh} (98%) rename cpp/include/raft/linalg/detail/{divide.hpp => divide.cuh} (96%) rename cpp/include/raft/linalg/detail/{eig.hpp => eig.cuh} (99%) rename cpp/include/raft/linalg/detail/{eltwise.hpp => eltwise.cuh} (97%) rename cpp/include/raft/linalg/detail/{lanczos.hpp => lanczos.cuh} (100%) rename cpp/include/raft/linalg/detail/{lstsq.hpp => lstsq.cuh} (98%) rename cpp/include/raft/linalg/detail/{mean_squared_error.hpp => mean_squared_error.cuh} (96%) rename cpp/include/raft/linalg/detail/{multiply.hpp => multiply.cuh} (96%) rename cpp/include/raft/linalg/detail/{norm.hpp => norm.cuh} (99%) rename cpp/include/raft/linalg/detail/{reduce.hpp => reduce.cuh} (95%) rename cpp/include/raft/linalg/detail/{svd.hpp => svd.cuh} (98%) rename cpp/include/raft/linalg/detail/{transpose.hpp => transpose.cuh} (100%) create mode 100644 cpp/include/raft/linalg/divide.cuh create mode 100644 cpp/include/raft/linalg/eig.cuh create mode 100644 cpp/include/raft/linalg/eltwise.cuh create mode 100644 cpp/include/raft/linalg/gemm.cuh create mode 100644 cpp/include/raft/linalg/gemv.cuh create mode 100644 cpp/include/raft/linalg/init.cuh create mode 100644 cpp/include/raft/linalg/lanczos.cuh create mode 100644 cpp/include/raft/linalg/lstsq.cuh create mode 100644 cpp/include/raft/linalg/map.cuh create mode 100644 cpp/include/raft/linalg/map_then_reduce.cuh create mode 100644 cpp/include/raft/linalg/matrix_vector_op.cuh create mode 100644 cpp/include/raft/linalg/mean_squared_error.cuh create mode 100644 cpp/include/raft/linalg/multiply.cuh create mode 100644 cpp/include/raft/linalg/norm.cuh create mode 100644 cpp/include/raft/linalg/power.hpp create mode 100644 cpp/include/raft/linalg/qr.cuh create mode 100644 cpp/include/raft/linalg/reduce.cuh create mode 100644 cpp/include/raft/linalg/reduce_cols_by_key.hpp create mode 100644 cpp/include/raft/linalg/reduce_rows_by_key.hpp create mode 100644 cpp/include/raft/linalg/rsvd.hpp create mode 100644 cpp/include/raft/linalg/sqrt.hpp create mode 100644 cpp/include/raft/linalg/strided_reduction.cuh create mode 100644 cpp/include/raft/linalg/subtract.cuh create mode 100644 cpp/include/raft/linalg/svd.cuh create mode 100644 cpp/include/raft/linalg/ternary_op.hpp create mode 100644 cpp/include/raft/linalg/transpose.cuh create mode 100644 cpp/include/raft/linalg/unary_op.cuh create mode 100644 cpp/include/raft/matrix/col_wise_sort.cuh create mode 100644 cpp/include/raft/matrix/math.cuh create mode 100644 cpp/include/raft/matrix/matrix.cuh create mode 100644 cpp/include/raft/random/make_blobs.cuh create mode 100644 cpp/include/raft/random/make_regression.cuh create mode 100644 cpp/include/raft/random/multi_variable_gaussian.cuh create mode 100644 cpp/include/raft/random/permute.cuh create mode 100644 cpp/include/raft/random/rng.cuh create mode 100644 
cpp/include/raft/sparse/convert/coo.cuh create mode 100644 cpp/include/raft/sparse/convert/csr.cuh create mode 100644 cpp/include/raft/sparse/convert/dense.cuh create mode 100644 cpp/include/raft/sparse/distance/distance.cuh rename cpp/include/raft/sparse/hierarchy/detail/{single_linkage.hpp => single_linkage.cuh} (99%) create mode 100644 cpp/include/raft/sparse/hierarchy/single_linkage.cuh create mode 100644 cpp/include/raft/sparse/linalg/add.cuh create mode 100644 cpp/include/raft/sparse/linalg/degree.cuh create mode 100644 cpp/include/raft/sparse/linalg/norm.cuh create mode 100644 cpp/include/raft/sparse/linalg/spectral.cuh create mode 100644 cpp/include/raft/sparse/linalg/symmetrize.cuh create mode 100644 cpp/include/raft/sparse/linalg/transpose.cuh create mode 100644 cpp/include/raft/sparse/mst/mst.hpp rename cpp/include/raft/sparse/op/detail/{slice.h => slice.cuh} (97%) create mode 100644 cpp/include/raft/sparse/op/filter.cuh create mode 100644 cpp/include/raft/sparse/op/reduce.cuh create mode 100644 cpp/include/raft/sparse/op/row_op.cuh create mode 100644 cpp/include/raft/sparse/op/slice.cuh create mode 100644 cpp/include/raft/sparse/op/sort.cuh create mode 100644 cpp/include/raft/sparse/selection/connect_components.cuh create mode 100644 cpp/include/raft/sparse/selection/knn.cuh create mode 100644 cpp/include/raft/sparse/selection/knn_graph.cuh create mode 100644 cpp/include/raft/spatial/knn/ann.cuh create mode 100644 cpp/include/raft/spatial/knn/ball_cover.cuh create mode 100644 cpp/include/raft/spatial/knn/epsilon_neighborhood.cuh create mode 100644 cpp/include/raft/spatial/knn/knn.cuh create mode 100644 cpp/include/raft/spatial/knn/specializations.cuh rename cpp/include/raft/spatial/knn/specializations/{ball_cover.hpp => ball_cover.cuh} (95%) rename cpp/include/raft/spatial/knn/specializations/{fused_l2_knn.hpp => fused_l2_knn.cuh} (98%) rename cpp/include/raft/spatial/knn/specializations/{knn.hpp => knn.cuh} (97%) create mode 100644 cpp/include/raft/spectral/cluster_solvers.cuh rename cpp/include/raft/spectral/detail/{matrix_wrappers.cuh => matrix_wrappers.hpp} (99%) create mode 100644 cpp/include/raft/spectral/eigen_solvers.cuh create mode 100644 cpp/include/raft/spectral/modularity_maximization.cuh create mode 100644 cpp/include/raft/spectral/partition.cuh create mode 100644 cpp/include/raft/stats/accuracy.cuh create mode 100644 cpp/include/raft/stats/adjusted_rand_index.cuh create mode 100644 cpp/include/raft/stats/completeness_score.cuh create mode 100644 cpp/include/raft/stats/contingency_matrix.cuh create mode 100644 cpp/include/raft/stats/cov.cuh create mode 100644 cpp/include/raft/stats/dispersion.cuh create mode 100644 cpp/include/raft/stats/entropy.cuh create mode 100644 cpp/include/raft/stats/histogram.cuh create mode 100644 cpp/include/raft/stats/homogeneity_score.cuh create mode 100644 cpp/include/raft/stats/information_criterion.cuh create mode 100644 cpp/include/raft/stats/kl_divergence.cuh create mode 100644 cpp/include/raft/stats/mean.cuh create mode 100644 cpp/include/raft/stats/mean_center.cuh create mode 100644 cpp/include/raft/stats/meanvar.cuh create mode 100644 cpp/include/raft/stats/minmax.cuh create mode 100644 cpp/include/raft/stats/mutual_info_score.cuh create mode 100644 cpp/include/raft/stats/r2_score.cuh create mode 100644 cpp/include/raft/stats/rand_index.cuh create mode 100644 cpp/include/raft/stats/regression_metrics.cuh create mode 100644 cpp/include/raft/stats/silhouette_score.cuh create mode 100644 cpp/include/raft/stats/specializations.cuh 
create mode 100644 cpp/include/raft/stats/stddev.cuh create mode 100644 cpp/include/raft/stats/sum.cuh create mode 100644 cpp/include/raft/stats/trustworthiness_score.cuh create mode 100644 cpp/include/raft/stats/v_measure.cuh create mode 100644 cpp/include/raft/stats/weighted_mean.cuh diff --git a/build.sh b/build.sh index 9a3295321f..9d3a796c65 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # cuml build script diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index a832f67aaf..83521e5d11 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. ######################## # RAFT Version Updater # ######################## diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index 8c29d2b321..51ed34754b 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh index 51e4037c60..f3777405c0 100644 --- a/cpp/include/raft/cluster/detail/kmeans.cuh +++ b/cpp/include/raft/cluster/detail/kmeans.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/cluster/kmeans.hpp b/cpp/include/raft/cluster/kmeans.cuh similarity index 98% rename from cpp/include/raft/cluster/kmeans.hpp rename to cpp/include/raft/cluster/kmeans.cuh index ab0fbb04c7..28d4ae0719 100644 --- a/cpp/include/raft/cluster/kmeans.hpp +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp index 14c33c6cf2..05678a7e49 100644 --- a/cpp/include/raft/comms/comms.hpp +++ b/cpp/include/raft/comms/comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/comms/comms_test.hpp b/cpp/include/raft/comms/comms_test.hpp index 1acb72bc85..f01060cb40 100644 --- a/cpp/include/raft/comms/comms_test.hpp +++ b/cpp/include/raft/comms/comms_test.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp index 6ba66fb6f3..ef93ae90c5 100644 --- a/cpp/include/raft/comms/detail/ucp_helper.hpp +++ b/cpp/include/raft/comms/detail/ucp_helper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp index d83e8f4d4f..b1aae86556 100644 --- a/cpp/include/raft/comms/helper.hpp +++ b/cpp/include/raft/comms/helper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp index 3fab04c441..ca5275cd06 100644 --- a/cpp/include/raft/comms/mpi_comms.hpp +++ b/cpp/include/raft/comms/mpi_comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp index 6fa0f7e37b..7604606ba1 100644 --- a/cpp/include/raft/comms/std_comms.hpp +++ b/cpp/include/raft/comms/std_comms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh index 21d04f3f8d..c88d5afeab 100644 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ b/cpp/include/raft/distance/detail/correlation.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #pragma once #include #include -#include +#include namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh index bead5f1f71..b7eed3e2a8 100644 --- a/cpp/include/raft/distance/detail/cosine.cuh +++ b/cpp/include/raft/distance/detail/cosine.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #pragma once #include -#include +#include namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 45850de115..4782afe46e 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
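Before the distance header diffs that follow, a minimal sketch of the shared-guard scheme described in the commit message may help. The `filea.hpp`/`filea.cuh` names and the `__FILEA_H` macro are hypothetical stand-ins from the commit message; the real headers in this patch use guards such as `__DISTANCE_H`, as the new distance.cuh below shows:

```cpp
// filea.cuh -- the new, CUDA-aware public header
#ifndef __FILEA_H
#define __FILEA_H

#pragma once

// ... declarations that require a CUDA-enabled compiler ...

#endif

// filea.hpp -- deprecated copy, kept so existing includes still compile
#ifndef __FILEA_H
#define __FILEA_H

#pragma once

// ... identical declarations ...

#endif
```

Because both headers define the same `__FILEA_H` guard, whichever one a translation unit includes first wins and the other becomes a no-op, so including the deprecated `hpp` alongside the new `cuh` never produces redefinition errors during the deprecation window.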
diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index 4786f584c4..d83e81b6a9 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once #include -#include +#include namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 80eb6021ef..81d02c410c 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh index 3cb0469803..31854fd1d6 100644 --- a/cpp/include/raft/distance/detail/hellinger.cuh +++ b/cpp/include/raft/distance/detail/hellinger.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once #include -#include +#include namespace raft { namespace distance { diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 996cc544a6..9d203c0c4f 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -16,8 +16,8 @@ #pragma once #include #include -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh new file mode 100644 index 0000000000..71c9e8d32b --- /dev/null +++ b/cpp/include/raft/distance/distance.cuh @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __DISTANCE_H +#define __DISTANCE_H + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace distance { + +/** + * @brief Evaluate pairwise distances with the user epilogue lambda allowed + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam FinalLambda user-defined epilogue lambda + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace needed for computations + * @param worksize number of bytes of the workspace + * @param fin_op the final gemm epilogue lambda + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + * + * @note fin_op: This is a device lambda which is supposed to operate upon the + * input which is AccType and returns the output in OutType. Its signature is + * as follows: <pre>OutType fin_op(AccType in, int g_idx);</pre>
. If one needs + * any other parameters, feel free to pass them via closure. + */ +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + FinalLambda fin_op, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + detail::distance( + x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); +} + +/** + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace needed for computations + * @param worksize number of bytes of the workspace + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + * + * @note if workspace is passed as nullptr, this will return in + * worksize, the number of bytes of workspace required + */ +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + detail::distance( + x, y, dist, m, n, k, workspace, worksize, stream, isRowMajor, metric_arg); +} + +/** + * @brief Return the exact workspace size to compute the distance + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * + * @note If the specified distanceType doesn't need the workspace at all, it + * returns 0. 
+ */ +template +size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) +{ + return detail::getWorkspaceSize(x, y, m, n, k); +} + +/** + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + * @param metric_arg metric argument (used for Minkowski distance) + * + * @note if workspace is passed as nullptr, this will return in + * worksize, the number of bytes of workspace required + */ +template +void distance(const InType* x, + const InType* y, + OutType* dist, + Index_ m, + Index_ n, + Index_ k, + cudaStream_t stream, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + rmm::device_uvector workspace(0, stream); + auto worksize = getWorkspaceSize(x, y, m, n, k); + workspace.resize(worksize, stream); + detail::distance( + x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); +} + +/** + * @defgroup pairwise_distance pairwise distance prims + * @{ + * @brief Convenience wrapper around 'distance' prim to convert runtime metric + * into compile time for the purpose of dispatch + * @tparam Type input/accumulation/output data-type + * @tparam Index_ indexing type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace buffer which can get resized as per the + * needed workspace size + * @param metric distance metric + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + */ +template +void pairwise_distance(const raft::handle_t& handle, + const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, + rmm::device_uvector& workspace, + raft::distance::DistanceType metric, + bool isRowMajor = true, + Type metric_arg = 2.0f) +{ + switch (metric) { + case raft::distance::DistanceType::L2Expanded: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::L2SqrtExpanded: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::CosineExpanded: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::L1: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::L2Unexpanded: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::L2SqrtUnexpanded: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::Linf: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case 
raft::distance::DistanceType::HellingerExpanded: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::LpUnexpanded: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor, metric_arg); + break; + case raft::distance::DistanceType::Canberra: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::HammingUnexpanded: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::JensenShannon: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::RusselRaoExpanded: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::KLDivergence: + detail::pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + case raft::distance::DistanceType::CorrelationExpanded: + detail:: + pairwise_distance_impl( + x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + break; + default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); + }; +} +/** @} */ + +/** + * @defgroup pairwise_distance pairwise distance prims + * @{ + * @brief Convenience wrapper around 'distance' prim to convert runtime metric + * into compile time for the purpose of dispatch + * @tparam Type input/accumulation/output data-type + * @tparam Index_ indexing type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param metric distance metric + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + */ +template +void pairwise_distance(const raft::handle_t& handle, + const Type* x, + const Type* y, + Type* dist, + Index_ m, + Index_ n, + Index_ k, + raft::distance::DistanceType metric, + bool isRowMajor = true, + Type metric_arg = 2.0f) +{ + rmm::device_uvector workspace(0, handle.get_stream()); + pairwise_distance( + handle, x, y, dist, m, n, k, workspace, metric, isRowMajor, metric_arg); +} + +}; // namespace distance +}; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp index 935cf6677a..f9fbde50e4 100644 --- a/cpp/include/raft/distance/distance.hpp +++ b/cpp/include/raft/distance/distance.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
+ */ + +#ifndef __DISTANCE_H +#define __DISTANCE_H #pragma once @@ -319,3 +326,5 @@ void pairwise_distance(const raft::handle_t& handle, }; // namespace distance }; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh new file mode 100644 index 0000000000..ac8895c9ce --- /dev/null +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __FUSED_L2_NN_H +#define __FUSED_L2_NN_H + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace distance { + +template +using KVPMinReduce = detail::KVPMinReduceImpl; + +template +using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl; + +template +using MinReduceOp = detail::MinReduceOpImpl; + +/** + * Initialize array using init value from reduction op + */ +template +void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) +{ + detail::initialize(min, m, maxVal, redOp, handle.get_stream()); +} + +/** + * @brief Fused L2 distance and 1-nearest-neighbor computation in a single call. + * + * The benefits of such a call are 2-fold: 1) eliminate the need for an + * intermediate buffer to store the output of gemm 2) reduce the memory read + * traffic on this intermediate buffer, otherwise needed during the reduction + * phase for 1-NN. + * + * @tparam DataT data type + * @tparam OutT output type to either store 1-NN indices and their minimum + * distances or store only the min distances. Accordingly, one + * has to pass an appropriate `ReduceOpT` + * @tparam IdxT indexing arithmetic type + * @tparam ReduceOpT A struct to perform the final needed reduction operation + * and also to initialize the output array elements with the + * appropriate initial value needed for reduction. + * + * @param[out] min will contain the reduced output (Length = `m`) + * (on device) + * @param[in] x first matrix. Row major. Dim = `m x k`. + * (on device). + * @param[in] y second matrix. Row major. Dim = `n x k`. + * (on device). + * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). + * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) + * @param[in] m gemm m + * @param[in] n gemm n + * @param[in] k gemm k + * @param[in] workspace temp workspace. Size = sizeof(int)*m. 
(on device) + * @param[in] redOp reduction operator in the epilogue + * @param[in] pairRedOp reduction operation on key value pairs + * @param[in] sqrt Whether the output `minDist` should contain L2-sqrt + * @param[in] initOutBuffer whether to initialize the output buffer before the + * main kernel launch + * @param[in] stream cuda stream + */ +template +void fusedL2NN(OutT* min, + const DataT* x, + const DataT* y, + const DataT* xn, + const DataT* yn, + IdxT m, + IdxT n, + IdxT k, + void* workspace, + ReduceOpT redOp, + KVPReduceOpT pairRedOp, + bool sqrt, + bool initOutBuffer, + cudaStream_t stream) +{ + size_t bytes = sizeof(DataT) * k; + if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { + detail::fusedL2NNImpl( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { + detail::fusedL2NNImpl( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else { + detail::fusedL2NNImpl( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } +} + +} // namespace distance +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/distance/fused_l2_nn.hpp b/cpp/include/raft/distance/fused_l2_nn.hpp index b293f0c237..1cb3ee39eb 100644 --- a/cpp/include/raft/distance/fused_l2_nn.hpp +++ b/cpp/include/raft/distance/fused_l2_nn.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __FUSED_L2_NN_H +#define __FUSED_L2_NN_H #pragma once @@ -111,3 +118,5 @@ void fusedL2NN(OutT* min, } // namespace distance } // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/distance/specializations.cuh b/cpp/include/raft/distance/specializations.cuh new file mode 100644 index 0000000000..5944534be7 --- /dev/null +++ b/cpp/include/raft/distance/specializations.cuh @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __DISTANCE_SPECIALIZATIONS_H +#define __DISTANCE_SPECIALIZATIONS_H + +#pragma once + +#include + +#endif \ No newline at end of file diff --git a/cpp/include/raft/distance/specializations.hpp b/cpp/include/raft/distance/specializations.hpp index e70943e731..db426c30d2 100644 --- a/cpp/include/raft/distance/specializations.hpp +++ b/cpp/include/raft/distance/specializations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,7 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __DISTANCE_SPECIALIZATIONS_H +#define __DISTANCE_SPECIALIZATIONS_H #pragma once -#include \ No newline at end of file +#include + +#endif \ No newline at end of file diff --git a/cpp/include/raft/distance/specializations/detail/canberra.hpp b/cpp/include/raft/distance/specializations/detail/canberra.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/canberra.hpp rename to cpp/include/raft/distance/specializations/detail/canberra.cuh index 2e71685532..22bdf41fd1 100644 --- a/cpp/include/raft/distance/specializations/detail/canberra.hpp +++ b/cpp/include/raft/distance/specializations/detail/canberra.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/detail/chebyshev.hpp b/cpp/include/raft/distance/specializations/detail/chebyshev.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/chebyshev.hpp rename to cpp/include/raft/distance/specializations/detail/chebyshev.cuh index dc03e047be..7502409082 100644 --- a/cpp/include/raft/distance/specializations/detail/chebyshev.hpp +++ b/cpp/include/raft/distance/specializations/detail/chebyshev.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/detail/correlation.hpp b/cpp/include/raft/distance/specializations/detail/correlation.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/correlation.hpp rename to cpp/include/raft/distance/specializations/detail/correlation.cuh index 2e7683ab10..a2cddea179 100644 --- a/cpp/include/raft/distance/specializations/detail/correlation.hpp +++ b/cpp/include/raft/distance/specializations/detail/correlation.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/detail/cosine.hpp b/cpp/include/raft/distance/specializations/detail/cosine.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/cosine.hpp rename to cpp/include/raft/distance/specializations/detail/cosine.cuh index b47d294645..c98703e135 100644 --- a/cpp/include/raft/distance/specializations/detail/cosine.hpp +++ b/cpp/include/raft/distance/specializations/detail/cosine.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
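The guard added above is what makes the hpp/cuh coexistence safe: the deprecated `specializations.hpp` and the new `specializations.cuh` share one `#ifndef __DISTANCE_SPECIALIZATIONS_H` symbol, so whichever file is included first wins and the other becomes a no-op. A condensed sketch of the pattern, using a hypothetical `foo` header:
@code{.cpp}
// foo.cuh and the deprecated foo.hpp (both hypothetical) carry the
// same guard symbol in addition to #pragma once:
#ifndef __FOO_H
#define __FOO_H
#pragma once
// ... identical contents ...
#endif

// Consequently this sequence compiles cleanly even though two distinct
// files are pulled in -- e.g. user code includes foo.hpp while raft
// internals already include foo.cuh:
#include <raft/foo.hpp>  // defines __FOO_H and emits the declarations
#include <raft/foo.cuh>  // guard already defined, contents skipped
@endcode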
diff --git a/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.hpp b/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/hamming_unexpanded.hpp rename to cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh index 29a4ca03d9..9cf7b9b343 100644 --- a/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.hpp +++ b/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/detail/hellinger_expanded.hpp b/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/hellinger_expanded.hpp rename to cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh index 264003ec0e..28ecaa1b65 100644 --- a/cpp/include/raft/distance/specializations/detail/hellinger_expanded.hpp +++ b/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/detail/jensen_shannon.hpp b/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh similarity index 98% rename from cpp/include/raft/distance/specializations/detail/jensen_shannon.hpp rename to cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh index 3135a4c579..ac0190562b 100644 --- a/cpp/include/raft/distance/specializations/detail/jensen_shannon.hpp +++ b/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/detail/kl_divergence.hpp b/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/kl_divergence.hpp rename to cpp/include/raft/distance/specializations/detail/kl_divergence.cuh index 207fca6bc2..b338cebdc2 100644 --- a/cpp/include/raft/distance/specializations/detail/kl_divergence.hpp +++ b/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/distance/specializations/detail/l1.hpp b/cpp/include/raft/distance/specializations/detail/l1.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/l1.hpp rename to cpp/include/raft/distance/specializations/detail/l1.cuh index e8eddfe1e4..65979ce414 100644 --- a/cpp/include/raft/distance/specializations/detail/l1.hpp +++ b/cpp/include/raft/distance/specializations/detail/l1.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/detail/l2_expanded.hpp b/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh similarity index 98% rename from cpp/include/raft/distance/specializations/detail/l2_expanded.hpp rename to cpp/include/raft/distance/specializations/detail/l2_expanded.cuh index db37b8db9f..1dac34ad7a 100644 --- a/cpp/include/raft/distance/specializations/detail/l2_expanded.hpp +++ b/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/detail/l2_sqrt_expanded.hpp b/cpp/include/raft/distance/specializations/detail/l2_sqrt_expanded.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/l2_sqrt_expanded.hpp rename to cpp/include/raft/distance/specializations/detail/l2_sqrt_expanded.cuh index ac23c9c357..8b752d8235 100644 --- a/cpp/include/raft/distance/specializations/detail/l2_sqrt_expanded.hpp +++ b/cpp/include/raft/distance/specializations/detail/l2_sqrt_expanded.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/detail/l2_sqrt_unexpanded.hpp b/cpp/include/raft/distance/specializations/detail/l2_sqrt_unexpanded.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/l2_sqrt_unexpanded.hpp rename to cpp/include/raft/distance/specializations/detail/l2_sqrt_unexpanded.cuh index 1e38575fbf..8632fda769 100644 --- a/cpp/include/raft/distance/specializations/detail/l2_sqrt_unexpanded.hpp +++ b/cpp/include/raft/distance/specializations/detail/l2_sqrt_unexpanded.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/distance/specializations/detail/l2_unexpanded.hpp b/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/l2_unexpanded.hpp rename to cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh index 035c9ef693..3962cfd1ae 100644 --- a/cpp/include/raft/distance/specializations/detail/l2_unexpanded.hpp +++ b/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/detail/lp_unexpanded.hpp b/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh similarity index 97% rename from cpp/include/raft/distance/specializations/detail/lp_unexpanded.hpp rename to cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh index 83eda5f07b..1f7e504ba8 100644 --- a/cpp/include/raft/distance/specializations/detail/lp_unexpanded.hpp +++ b/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/specializations/distance.hpp b/cpp/include/raft/distance/specializations/distance.cuh similarity index 54% rename from cpp/include/raft/distance/specializations/distance.hpp rename to cpp/include/raft/distance/specializations/distance.cuh index a57d6f49a5..7553f87e39 100644 --- a/cpp/include/raft/distance/specializations/distance.hpp +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,17 +16,17 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh new file mode 100644 index 0000000000..93c1080ff2 --- /dev/null +++ b/cpp/include/raft/label/classlabels.cuh @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CLASS_LABELS_H +#define __CLASS_LABELS_H + +#pragma once + +#include + +namespace raft { +namespace label { + +/** + * Get unique class labels. 
+ * + * The y array is assumed to store class labels. The unique values are selected + * from this array. + * + * @tparam value_t numeric type of the arrays with class labels + * @param [inout] unique output unique labels + * @param [in] y device array of labels, size [n] + * @param [in] n number of labels + * @param [in] stream cuda stream + * @returns unique device array of unique labels, unallocated on entry; + * on exit it has size equal to the returned number of unique labels + */ +template +int getUniquelabels(rmm::device_uvector& unique, value_t* y, size_t n, cudaStream_t stream) +{ + return detail::getUniquelabels(unique, y, n, stream); +} + +/** + * Assign one versus rest labels. + * + * The output labels will have values +/-1: + * y_out = (y == y_unique[idx]) ? +1 : -1; + * + * The output type currently is set to value_t, but for SVM in principle we are + * free to choose another type for y_out (it should represent +/-1, and it is used + * in floating point arithmetic). + * + * @param [in] y device array of input labels, size [n] + * @param [in] n number of labels + * @param [in] y_unique device array of unique labels, size [n_classes] + * @param [in] n_classes number of unique labels + * @param [out] y_out device array of output labels + * @param [in] idx index of unique label that should be labeled as 1 + * @param [in] stream cuda stream + */ +template +void getOvrlabels( + value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream) +{ + detail::getOvrlabels(y, n, y_unique, n_classes, y_out, idx, stream); +} +/** + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices needs to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. + * @param[out] out the output monotonic array + * @param[in] in input label array + * @param[in] N number of elements in the input array + * @param[in] stream cuda stream to use + * @param[in] filter_op an optional function for specifying which values + * should have monotonically increasing labels applied to them. + * @param[in] zero_based force monotonic set to start at 0? + */ +template +void make_monotonic( + Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false) +{ + detail::make_monotonic(out, in, N, stream, filter_op, zero_based); +} + +/** + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices needs to be labeled in a monotonically increasing + * order.
+ * @tparam Type the numeric type of the input and output arrays + * @param[out] out output label array with labels assigned monotonically + * @param[in] in input label array + * @param[in] N number of elements in the input array + * @param[in] stream cuda stream to use + * @param[in] zero_based force monotonic label set to start at 0? + */ +template +void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false) +{ + detail::make_monotonic(out, in, N, stream, zero_based); +} +}; // namespace label +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/label/classlabels.hpp b/cpp/include/raft/label/classlabels.hpp index de9f60518d..189c26f69f 100644 --- a/cpp/include/raft/label/classlabels.hpp +++ b/cpp/include/raft/label/classlabels.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,9 @@ * limitations under the License. */ +#ifndef __CLASS_LABELS_H +#define __CLASS_LABELS_H + #pragma once #include @@ -115,3 +118,5 @@ void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zer } }; // namespace label }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/label/detail/classlabels.cuh b/cpp/include/raft/label/detail/classlabels.cuh index 53657a5dfa..a941751d78 100644 --- a/cpp/include/raft/label/detail/classlabels.cuh +++ b/cpp/include/raft/label/detail/classlabels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/label/detail/merge_labels.cuh b/cpp/include/raft/label/detail/merge_labels.cuh index bf03d1c738..1f62b3f0d6 100644 --- a/cpp/include/raft/label/detail/merge_labels.cuh +++ b/cpp/include/raft/label/detail/merge_labels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include #include -#include +#include namespace raft { namespace label { diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh new file mode 100644 index 0000000000..2bf2fa830b --- /dev/null +++ b/cpp/include/raft/label/merge_labels.cuh @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
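For orientation, a small sketch of how the `classlabels.cuh` entry points above fit together; the `int` instantiation, label values, and variable names are made up for illustration:
@code{.cpp}
#include <raft/label/classlabels.cuh>
#include <rmm/device_uvector.hpp>

// y: device array of n arbitrary class labels, e.g. {2, 7, 2, 5, 7, ...}
rmm::device_uvector<int> unique(0, stream);
int n_classes = raft::label::getUniquelabels(unique, y, n, stream);

// One-vs-rest encoding of class unique[0]: +1 where y matches, -1 elsewhere.
rmm::device_uvector<int> y_ovr(n, stream);
raft::label::getOvrlabels(y, (int)n, unique.data(), n_classes, y_ovr.data(), 0, stream);

// Remap e.g. {2, 5, 7} onto the monotonic, zero-based set {0, 1, 2}.
rmm::device_uvector<int> y_mono(n, stream);
raft::label::make_monotonic(y_mono.data(), y, n, stream, true);
@endcode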
+ */ + +#ifndef __MERGE_LABELS_H +#define __MERGE_LABELS_H + +#pragma once + +#include + +namespace raft { +namespace label { + +/** + * @brief Merge two labellings in-place, according to a core mask + * + * A labelling is a representation of disjoint sets (groups) where points that + * belong to the same group have the same label. It is assumed that group + * labels take values between 1 and N. labels relate to points, i.e a label i+1 + * means that you belong to the same group as the point i. + * The special value MAX_LABEL is used to mark points that are not labelled. + * + * The two label arrays A and B induce two sets of groups over points 0..N-1. + * If a point is labelled i in A and j in B and the mask is true for this + * point, then i and j are equivalent labels and their groups are merged by + * relabeling the elements of both groups to have the same label. The new label + * is the smaller one from the original labels. + * It is required that if the mask is true for a point, this point is labelled + * (i.e its label is different than the special value MAX_LABEL). + * + * One use case is finding connected components: the two input label arrays can + * represent the connected components of graphs G_A and G_B, and the output + * would be the connected components labels of G_A \union G_B. + * + * @param[inout] labels_a First input, and output label array (in-place) + * @param[in] labels_b Second input label array + * @param[in] mask Core point mask + * @param[out] R label equivalence map + * @param[in] m Working flag + * @param[in] N Number of points in the dataset + * @param[in] stream CUDA stream + */ +template +void merge_labels(value_idx* labels_a, + const value_idx* labels_b, + const bool* mask, + value_idx* R, + bool* m, + value_idx N, + cudaStream_t stream) +{ + detail::merge_labels(labels_a, labels_b, mask, R, m, N, stream); +} + +}; // namespace label +}; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/label/merge_labels.hpp b/cpp/include/raft/label/merge_labels.hpp index 5ba8fe8a27..2bf2fa830b 100644 --- a/cpp/include/raft/label/merge_labels.hpp +++ b/cpp/include/raft/label/merge_labels.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,9 @@ * limitations under the License. */ +#ifndef __MERGE_LABELS_H +#define __MERGE_LABELS_H + #pragma once #include @@ -63,4 +66,6 @@ void merge_labels(value_idx* labels_a, } }; // namespace label -}; // namespace raft \ No newline at end of file +}; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/lap/detail/d_structs.h b/cpp/include/raft/lap/detail/d_structs.h index e488dc528f..74679d64ce 100644 --- a/cpp/include/raft/lap/detail/d_structs.h +++ b/cpp/include/raft/lap/detail/d_structs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * Copyright 2020 KETAN DATE & RAKESH NAGI * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/include/raft/lap/detail/lap_functions.cuh b/cpp/include/raft/lap/detail/lap_functions.cuh index 6c6b09e5d8..3a801ff060 100644 --- a/cpp/include/raft/lap/detail/lap_functions.cuh +++ b/cpp/include/raft/lap/detail/lap_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
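A usage sketch for the `merge_labels` wrapper above; all buffers, sizes, and names are hypothetical:
@code{.cpp}
#include <raft/label/merge_labels.cuh>
#include <rmm/device_uvector.hpp>

// labels_a / labels_b: two labellings of the same N points; mask marks the
// core points that are allowed to tie a group in A to a group in B.
rmm::device_uvector<int> R(N, stream);   // label equivalence map (scratch)
rmm::device_uvector<bool> m(1, stream);  // single working flag
raft::label::merge_labels(labels_a, labels_b, mask, R.data(), m.data(), N, stream);
// labels_a now holds the merged labelling; merged groups keep the smaller
// of their two original labels.
@endcode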
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION. * Copyright 2020 KETAN DATE & RAKESH NAGI * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/include/raft/lap/detail/lap_kernels.cuh b/cpp/include/raft/lap/detail/lap_kernels.cuh index b61d0bd269..e98b246733 100644 --- a/cpp/include/raft/lap/detail/lap_kernels.cuh +++ b/cpp/include/raft/lap/detail/lap_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * Copyright 2020 KETAN DATE & RAKESH NAGI * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/include/raft/lap/lap.hpp b/cpp/include/raft/lap/lap.cuh similarity index 99% rename from cpp/include/raft/lap/lap.hpp rename to cpp/include/raft/lap/lap.cuh index 2350ebcddf..5f72ca27c8 100644 --- a/cpp/include/raft/lap/lap.hpp +++ b/cpp/include/raft/lap/lap.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * Copyright 2020 KETAN DATE & RAKESH NAGI * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh new file mode 100644 index 0000000000..92152a8c03 --- /dev/null +++ b/cpp/include/raft/linalg/add.cuh @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ADD_H +#define __ADD_H + +#pragma once + +#include "detail/add.cuh" + +namespace raft { +namespace linalg { + +using detail::adds_scalar; + +/** + * @brief Elementwise scalar add operation on the input buffer + * + * @tparam InT input data-type. Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used for addressing + * + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + */ +template +void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream) +{ + detail::addScalar(out, in, scalar, len, stream); +} + +/** + * @brief Elementwise add operation on the input buffers + * @tparam InT input data-type.
Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used for addressing + * + * @param out the output buffer + * @param in1 the first input buffer + * @param in2 the second input buffer + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + */ +template +void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ + detail::add(out, in1, in2, len, stream); +} + +/** Subtract the single value pointed to by the singleScalarDev parameter, which resides in device + * memory, from inDev[i] and write the result to outDev[i] + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used for addressing + * @param outDev the output buffer + * @param inDev the input buffer + * @param singleScalarDev pointer to the scalar located in device memory + * @param len number of elements in the input and output buffer + * @param stream cuda stream + */ +template +void addDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ + detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/add.hpp b/cpp/include/raft/linalg/add.hpp index 2f999a45d2..32c7f68459 100644 --- a/cpp/include/raft/linalg/add.hpp +++ b/cpp/include/raft/linalg/add.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __ADD_H +#define __ADD_H #pragma once @@ -84,3 +91,5 @@ void addDevScalar(math_t* outDev, }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/axpy.cuh b/cpp/include/raft/linalg/axpy.cuh new file mode 100644 index 0000000000..2e23047b5a --- /dev/null +++ b/cpp/include/raft/linalg/axpy.cuh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
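A quick sketch of the elementwise entry points exposed by the new `add.cuh` above; `out`, `in`, `in1`, `in2`, `len`, and `stream` are assumed device buffers/values, not part of this patch:
@code{.cpp}
#include <raft/linalg/add.cuh>

// out[i] = in[i] + 3.0f for len elements (all pointers on the device)
raft::linalg::addScalar(out, in, 3.0f, len, stream);

// out[i] = in1[i] + in2[i]
raft::linalg::add(out, in1, in2, len, stream);
@endcode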
+ */ +#ifndef __AXPY_H +#define __AXPY_H + +#pragma once + +#include "detail/axpy.cuh" + +namespace raft::linalg { + +/** + * @brief the wrapper of cublas axpy function + * It computes the following equation: y = alpha * x + y + * + * @tparam T the element type + * @tparam DevicePointerMode whether pointers alpha, beta point to device memory + * @param [in] handle raft handle + * @param [in] n number of elements in x and y + * @param [in] alpha host or device scalar + * @param [in] x vector of length n + * @param [in] incx stride between consecutive elements of x + * @param [inout] y vector of length n + * @param [in] incy stride between consecutive elements of y + * @param [in] stream + */ +template +void axpy(const raft::handle_t& handle, + const int n, + const T* alpha, + const T* x, + const int incx, + T* y, + const int incy, + cudaStream_t stream) +{ + detail::axpy(handle, n, alpha, x, incx, y, incy, stream); +} + +} // namespace raft::linalg + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/axpy.hpp b/cpp/include/raft/linalg/axpy.hpp index 5a5a873132..921ed3f89b 100644 --- a/cpp/include/raft/linalg/axpy.hpp +++ b/cpp/include/raft/linalg/axpy.hpp @@ -13,10 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __AXPY_H +#define __AXPY_H #pragma once -#include "detail/axpy.hpp" +#include "detail/axpy.cuh" namespace raft::linalg { @@ -49,3 +56,5 @@ void axpy(const raft::handle_t& handle, } } // namespace raft::linalg + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh new file mode 100644 index 0000000000..a85bf698f7 --- /dev/null +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
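Sketch of calling the new `axpy.cuh` wrapper above, assuming the default host-pointer mode for `alpha`; `handle`, `x`, `y`, `n`, and `stream` are illustrative:
@code{.cpp}
#include <raft/linalg/axpy.cuh>

// y = 2.5 * x + y for device vectors of length n with unit strides;
// alpha lives on the host (the default pointer mode).
const float alpha = 2.5f;
raft::linalg::axpy(handle, n, &alpha, x, 1, y, 1, stream);
@endcode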
+ */ +#ifndef __BINARY_OP_H +#define __BINARY_OP_H + +#pragma once + +#include "detail/binary_op.cuh" + +#include + +namespace raft { +namespace linalg { + +/** + * @brief perform element-wise binary operation on the input arrays + * @tparam InType input data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val1, const InType& val2);` + */ +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + detail::binaryOp(out, in1, in2, len, op, stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/binary_op.hpp b/cpp/include/raft/linalg/binary_op.hpp index 5c73b6d3c5..468c278909 100644 --- a/cpp/include/raft/linalg/binary_op.hpp +++ b/cpp/include/raft/linalg/binary_op.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __BINARY_OP_H +#define __BINARY_OP_H #pragma once @@ -52,3 +59,5 @@ void binaryOp( }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh new file mode 100644 index 0000000000..7d22d6bcf7 --- /dev/null +++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CHOLESKY_R1_UPDATE_H +#define __CHOLESKY_R1_UPDATE_H + +#pragma once + +#include "detail/cholesky_r1_update.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Rank 1 update of Cholesky decomposition. + * + * This method is useful if an algorithm iteratively builds up matrix A, and + * the Cholesky decomposition of A is required at each step. + * + * On entry, L is the Cholesky decomposition of matrix A, where both A and L + * have size n-1 x n-1. We are interested in the Cholesky decomposition of a new + * matrix A', which we get by adding a row and column to A. In Python notation: + * - A'[0:n-1, 0:n-1] = A; + * - A'[:,n-1] = A[n-1,:] = A_new + * + * On entry, the new column A_new, is stored as the n-th column of L if uplo == + * CUBLAS_FILL_MODE_UPPER, else A_new is stored as the n-th row of L. + * + * On exit L contains the Cholesky decomposition of A'. 
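Sketch of the `binaryOp` wrapper above with a device lambda; the buffers are assumptions, and the extended-lambda syntax requires nvcc's `--extended-lambda` flag:
@code{.cpp}
#include <raft/linalg/binary_op.cuh>

// out[i] = in1[i] * in2[i]; the lambda runs on the device, hence the
// __device__ annotation on the callable.
raft::linalg::binaryOp(
  out, in1, in2, len, [] __device__(float a, float b) { return a * b; }, stream);
@endcode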
In practice the elements + * of A_new are overwritten with the new row/column of the L matrix. + * + * The uplo parameter is used to select the matrix layout. + * If (uplo != CUBLAS_FILL_MODE_UPPER) then the input arg L stores the + * lower triangular matrix L, so that A = L * L.T. Otherwise the input arg L + * stores an upper triangular matrix U: A = U.T * U. + * + * On exit L will be updated to store the Cholesky decomposition of A'. + * + * If the matrix is not positive definite, or is very ill-conditioned, then the + * new diagonal element of L would be NaN. In such a case an exception is thrown. + * The eps argument can be used to override this behavior: if eps >= 0 then + * the diagonal element is replaced by eps in case the diagonal is NaN or + * smaller than eps. Note: for an iterative solver it is probably better to + * stop early in case of error, rather than relying on the eps parameter. + * + * Examples: + * + * - Lower triangular factorization: + * @code{.cpp} + * // Initialize arrays + * int ld_L = n_rows; + * int n_bytes; + * rmm::device_uvector<math_t> L(ld_L * n_rows, stream); + * raft::linalg::choleskyRank1Update(handle, L.data(), n_rows, ld_L, nullptr, + * &n_bytes, CUBLAS_FILL_MODE_LOWER, + * stream); + * rmm::device_uvector<char> workspace(n_bytes, stream); + * + * for (int rank = 1; rank <= n_rows; rank++) { + * // Calculate a new row/column of matrix A into A_new + * // ... + * // Copy new row to L[rank-1,:] + * RAFT_CUBLAS_TRY(cublasCopy(handle.get_cublas_handle(), rank - 1, A_new, 1, + * L.data() + rank - 1, ld_L, stream)); + * // Update Cholesky factorization + * raft::linalg::choleskyRank1Update( + * handle, L.data(), rank, ld_L, workspace.data(), &n_bytes, + * CUBLAS_FILL_MODE_LOWER, stream); + * } + * // Now L stores the Cholesky decomposition of A: A = L * L.T + * @endcode + * + * - Upper triangular factorization: + * @code{.cpp} + * // Initialize arrays + * int ld_U = n_rows; + * int n_bytes; + * rmm::device_uvector<math_t> U(ld_U * n_rows, stream); + * raft::linalg::choleskyRank1Update(handle, U.data(), n_rows, ld_U, nullptr, + * &n_bytes, CUBLAS_FILL_MODE_UPPER, + * stream); + * rmm::device_uvector<char> workspace(n_bytes, stream); + * + * for (int n = 1; n <= n_rows; n++) { + * // Calculate a new row/column of matrix A into array A_new + * // ... + * // Copy new row to U[:,n-1] (column major layout) + * raft::copy(U.data() + ld_U * (n - 1), A_new, n - 1, stream); + * // + * // Update Cholesky factorization + * raft::linalg::choleskyRank1Update( + * handle, U.data(), n, ld_U, workspace.data(), &n_bytes, + * CUBLAS_FILL_MODE_UPPER, stream); + * } + * // Now U stores the Cholesky decomposition of A: A = U.T * U + * @endcode + * + * @param handle RAFT handle (used to retrieve cuBLAS handles). + * @param L device array to store the triangular matrix L, and the new + * column of A, in column major format, size [n*n] + * @param n number of elements in the new row. + * @param ld stride of columns in L + * @param workspace device pointer to workspace; shall be nullptr or an array + * of size [n_bytes]. + * @param n_bytes size of workspace is returned here if workspace==nullptr. + * @param uplo indicates whether L is stored as an upper or lower triangular + * matrix (CUBLAS_FILL_MODE_UPPER or CUBLAS_FILL_MODE_LOWER) + * @param stream CUDA stream + * @param eps numerical parameter that can act as a regularizer for ill + * conditioned systems. Negative values mean no regularization.
+ */ +template +void choleskyRank1Update(const raft::handle_t& handle, + math_t* L, + int n, + int ld, + void* workspace, + int* n_bytes, + cublasFillMode_t uplo, + cudaStream_t stream, + math_t eps = -1) +{ + detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps); +} +}; // namespace linalg +}; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/cholesky_r1_update.hpp b/cpp/include/raft/linalg/cholesky_r1_update.hpp index 583c65c50e..b55f5d06da 100644 --- a/cpp/include/raft/linalg/cholesky_r1_update.hpp +++ b/cpp/include/raft/linalg/cholesky_r1_update.hpp @@ -13,10 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __CHOLESKY_R1_UPDATE_H +#define __CHOLESKY_R1_UPDATE_H #pragma once -#include "detail/cholesky_r1_update.hpp" +#include "detail/cholesky_r1_update.cuh" namespace raft { namespace linalg { @@ -132,3 +139,5 @@ void choleskyRank1Update(const raft::handle_t& handle, } }; // namespace linalg }; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh new file mode 100644 index 0000000000..03477f72d6 --- /dev/null +++ b/cpp/include/raft/linalg/coalesced_reduction.cuh @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __COALESCED_REDUCTION_H +#define __COALESCED_REDUCTION_H + +#pragma once + +#include "detail/coalesced_reduction.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Compute reduction of the input matrix along the leading dimension + * + * @tparam InType the data type of the input + * @tparam OutType the data type of the output (as well as the data type for + * which reduction is performed) + * @tparam IdxType data type of the indices of the array + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + * OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + * OutType (*FinalLambda)(OutType);
+ * @param dots the output reduction vector + * @param data the input matrix + * @param D leading dimension of data + * @param N second dimension data + * @param init initial value to use for the reduction + * @param main_op elementwise operation to apply before reduction + * @param reduce_op binary reduction operation + * @param final_op elementwise operation to apply before storing results + * @param inplace reduction result added inplace or overwrites old values? + * @param stream cuda stream where to launch work + */ +template , + typename ReduceLambda = raft::Sum, + typename FinalLambda = raft::Nop> +void coalescedReduction(OutType* dots, + const InType* data, + int D, + int N, + OutType init, + cudaStream_t stream, + bool inplace = false, + MainLambda main_op = raft::Nop(), + ReduceLambda reduce_op = raft::Sum(), + FinalLambda final_op = raft::Nop()) +{ + detail::coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/coalesced_reduction.hpp b/cpp/include/raft/linalg/coalesced_reduction.hpp index 0f1ca9202d..4b9e5d262f 100644 --- a/cpp/include/raft/linalg/coalesced_reduction.hpp +++ b/cpp/include/raft/linalg/coalesced_reduction.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __COALESCED_REDUCTION_H +#define __COALESCED_REDUCTION_H #pragma once @@ -70,3 +77,5 @@ void coalescedReduction(OutType* dots, }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh new file mode 100644 index 0000000000..5ccbd15c3d --- /dev/null +++ b/cpp/include/raft/linalg/contractions.cuh @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CONTRACTIONS_H +#define __CONTRACTIONS_H + +#pragma once + +#include "detail/contractions.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief This is the central enum that should be used to configure the perf + * landscape of the Contraction kernel. + * + * Main goal of this Policy struct is to provide sufficient knobs to tune the + * perf of Contraction kernel, as and when we see matrices of different shapes. + * + * @tparam DataT the IO and math datatype + * @tparam _veclen number of k-elements loaded by each thread for every LDG call + * it makes. This should be configured based on the input 'k' + * value and the input data type. For eg: if DataT = float and + * k is multiples of 4, then setting this to 4 gives the best + * LDG pattern. Possible values are {1, 2, 4}. + * @tparam _kblk number of k-elements operated upon per main-loop iteration. 
+ * Therefore total number of main-loop iterations will be + * `ceil(k/_kblk)`. This must be multiples of `_veclen`. Do note + * that bigger this value, the greater shared mem requirement. + * @tparam _rpt Defines the number of rows that a given thread accumulates on. + * This directly results in increased register pressure. This + * also is used to compute the number of m-elements worked upon + * by each thread block. + * @tparam _cpt Defines the number of cols that a given thread accumulates on. + * This directly results in increased register pressure. This + * also is used to compute the number of n-elements worked upon + * by each thread block. + * @tparam _tr Number of threads working on the same output column. This is + * used to compute the number of m-elements worked upon by each + * thread block. This also determines the number of threads per + * thread block + * @tparam _tc Number of threads working on the same output row. This is + * used to compute the number of m-elements worked upon by each + * thread block. This also determines the number of threads per + * thread block + */ +template +struct KernelPolicy { + enum { + /** number of elements along K worked upon per main loop iteration */ + Kblk = _kblk, + /** number of elements loaded per LDG */ + Veclen = _veclen, + /** number of rows a thread works on for accumulation */ + AccRowsPerTh = _rpt, + /** number of cols a thread works on for accumulation */ + AccColsPerTh = _cpt, + /** number of threads working the same output col */ + AccThRows = _tr, + /** number of threads working the same output row */ + AccThCols = _tc, + /** total threads per block */ + Nthreads = AccThRows * AccThCols, + /** output tile size along rows */ + Mblk = AccRowsPerTh * AccThRows, + /** output tile size along cols */ + Nblk = AccColsPerTh * AccThCols, + /** number of threads loading a single row */ + LdgThRow = Kblk / Veclen, + /** number of LDGs issued by a single thread for X */ + LdgPerThX = Mblk * LdgThRow / Nthreads, + /** number of LDGs issued by a single thread for Y */ + LdgPerThY = Nblk * LdgThRow / Nthreads, + /** number of rows of X covered per LDG */ + LdgRowsX = Mblk / LdgPerThX, + /** number of rows of Y covered per LDG */ + LdgRowsY = Nblk / LdgPerThY, + /** stride for accessing X/Y data in shared mem */ + SmemStride = Kblk + Veclen, + /** size of one page for storing X data */ + SmemPageX = SmemStride * Mblk, + /** size of one page for storing Y data */ + SmemPageY = SmemStride * Nblk, + /** size of one smem page */ + SmemPage = SmemPageX + SmemPageY, + /** size (in B) for smem needed */ + SmemSize = 2 * SmemPage * sizeof(DataT), + }; // enum + +}; // struct KernelPolicy + +template +struct ColKernelPolicy { + enum { + /** number of elements along K worked upon per main loop iteration */ + Kblk = _kblk, + /** number of elements loaded per LDG */ + Veclen = _veclen, + /** number of rows a thread works on for accumulation */ + AccRowsPerTh = _rpt, + /** number of cols a thread works on for accumulation */ + AccColsPerTh = _cpt, + /** number of threads working the same output col */ + AccThRows = _tr, + /** number of threads working the same output row */ + AccThCols = _tc, + /** total threads per block */ + Nthreads = AccThRows * AccThCols, + /** output tile size along rows */ + Mblk = AccRowsPerTh * AccThRows, + /** output tile size along cols */ + Nblk = AccColsPerTh * AccThCols, + /** number of threads loading a single col */ + LdgThRow = Mblk / Veclen, + /** number of LDGs issued by a single thread for X */ + LdgPerThX = Kblk * 
LdgThRow / Nthreads, + /** number of LDGs issued by a single thread for Y */ + LdgPerThY = Kblk * LdgThRow / Nthreads, + /** number of rows of X covered per LDG */ + LdgRowsX = Kblk / LdgPerThX, + /** number of rows of Y covered per LDG */ + LdgRowsY = Kblk / LdgPerThY, + /** stride for accessing X/Y data in shared mem */ + SmemStride = Mblk + Veclen, + /** size of one page for storing X data */ + SmemPageX = SmemStride * Kblk, + /** size of one page for storing Y data */ + SmemPageY = SmemStride * Kblk, + /** size of one smem page */ + SmemPage = SmemPageX + SmemPageY, + /** size (in B) for smem needed */ + SmemSize = 2 * SmemPage * sizeof(DataT), + }; // colMajor enum + static_assert(Mblk == Nblk, "Mblk should be equal to Nblk"); +}; +/** + * @defgroup Policy4x4 16 elements per thread Policy with k-block = 32 + * @{ + */ +template +struct Policy4x4 { +}; + +template +struct Policy4x4 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + +template +struct Policy4x4 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; +/** @} */ + +/** + * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 + * @{ + */ +template +struct Policy2x8 { +}; + +template +struct Policy2x8 { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + +template +struct Policy2x8 { + // this is not used just for keeping compiler happy. + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; +/** @} */ + +/** + * @brief Base class for gemm-like NT contractions + * + * This class does not provide any arithmetic operations, but only provides the + * memory-related operations of loading the `x` and `y` matrix blocks from the + * global memory into shared memory and then from shared into registers. Thus, + * this class acts as a basic building block for further composing gemm-like NT + * contractions on input matrices which are row-major (and so does the output) + * + * @tparam DataT IO and math data type + * @tparam IdxT indexing type + * @tparam Policy policy used to customize memory access behavior. + * See documentation for `KernelPolicy` to know more. + */ +using detail::Contractions_NT; + +} // namespace linalg +} // namespace raft + +#endif diff --git a/cpp/include/raft/linalg/contractions.hpp b/cpp/include/raft/linalg/contractions.hpp index e317588b1d..84c86b93a4 100644 --- a/cpp/include/raft/linalg/contractions.hpp +++ b/cpp/include/raft/linalg/contractions.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __CONTRACTIONS_H +#define __CONTRACTIONS_H #pragma once @@ -205,3 +212,5 @@ using detail::Contractions_NT; } // namespace linalg } // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/cublas_macros.h b/cpp/include/raft/linalg/cublas_macros.h new file mode 100644 index 0000000000..1cb5cfc81a --- /dev/null +++ b/cpp/include/raft/linalg/cublas_macros.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
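To make the derived `KernelPolicy` constants above concrete, here is the arithmetic for one hypothetical instantiation; the numeric template values (veclen=4, kblk=32, rpt=cpt=4, tr=tc=16) are chosen purely for illustration and are not asserted anywhere in this patch:
@code{.cpp}
#include <raft/linalg/contractions.cuh>

using P = raft::linalg::KernelPolicy<float, 4, 32, 4, 4, 16, 16>;

static_assert(P::Nthreads == 256, "16 x 16 threads per block");
static_assert(P::Mblk == 64 && P::Nblk == 64, "4 rows/cols per thread x 16 threads");
static_assert(P::LdgThRow == 8, "32 / 4 vectorized loads per row");
static_assert(P::SmemStride == 36, "Kblk padded by Veclen");
// Two double-buffered pages, each a (36 x 64) X tile plus a (36 x 64) Y
// tile of floats: 2 * (2304 + 2304) * 4 = 36864 bytes.
static_assert(P::SmemSize == 36864, "36 KiB of shared memory");
@endcode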
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +///@todo: enable this once we have logger enabled +//#include + +#include + +#define _CUBLAS_ERR_TO_STR(err) \ + case err: return #err + +namespace raft { + +/** + * @brief Exception thrown when a cuBLAS error is encountered. + */ +struct cublas_error : public raft::exception { + explicit cublas_error(char const* const message) : raft::exception(message) {} + explicit cublas_error(std::string const& message) : raft::exception(message) {} +}; + +namespace linalg { +namespace detail { + +inline const char* cublas_error_to_string(cublasStatus_t err) +{ + switch (err) { + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: return "CUBLAS_STATUS_UNKNOWN"; + }; +} + +} // namespace detail +} // namespace linalg +} // namespace raft + +#undef _CUBLAS_ERR_TO_STR + +/** + * @brief Error checking macro for cuBLAS runtime API functions. + * + * Invokes a cuBLAS runtime API function call, if the call does not return + * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred + */ +#define RAFT_CUBLAS_TRY(call) \ + do { \ + cublasStatus_t const status = (call); \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuBLAS error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + throw raft::cublas_error(msg); \ + } \ + } while (0) + +// FIXME: Remove after consumers rename +#ifndef CUBLAS_TRY +#define CUBLAS_TRY(call) RAFT_CUBLAS_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. +// */ +#define RAFT_CUBLAS_TRY_NO_THROW(call) \ + do { \ + cublasStatus_t const status = call; \ + if (CUBLAS_STATUS_SUCCESS != status) { \ + printf("CUBLAS call='%s' at file=%s line=%d failed with %s\n", \ + #call, \ + __FILE__, \ + __LINE__, \ + raft::linalg::detail::cublas_error_to_string(status)); \ + } \ + } while (0) + +/** FIXME: remove after cuml rename */ +#ifndef CUBLAS_CHECK +#define CUBLAS_CHECK(call) CUBLAS_TRY(call) +#endif + +/** FIXME: remove after cuml rename */ +#ifndef CUBLAS_CHECK_NO_THROW +#define CUBLAS_CHECK_NO_THROW(call) RAFT_CUBLAS_TRY_NO_THROW(call) +#endif diff --git a/cpp/include/raft/linalg/cusolver_macros.h b/cpp/include/raft/linalg/cusolver_macros.h new file mode 100644 index 0000000000..6db0577509 --- /dev/null +++ b/cpp/include/raft/linalg/cusolver_macros.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
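Usage sketch for the relocated cuBLAS error macros above; `handle`, `stream`, and `cublas_handle` are assumed surrounding variables:
@code{.cpp}
#include <raft/linalg/cublas_macros.h>

// Throws raft::cublas_error carrying the stringified status on failure:
RAFT_CUBLAS_TRY(cublasSetStream(handle.get_cublas_handle(), stream));

// Non-throwing flavor, e.g. for destructors and teardown paths:
RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle));
@endcode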
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +///@todo: enable this once logging is enabled +//#include +#include +#include + +#define _CUSOLVER_ERR_TO_STR(err) \ + case err: return #err; + +namespace raft { + +/** + * @brief Exception thrown when a cuSOLVER error is encountered. + */ +struct cusolver_error : public raft::exception { + explicit cusolver_error(char const* const message) : raft::exception(message) {} + explicit cusolver_error(std::string const& message) : raft::exception(message) {} +}; + +namespace linalg { + +inline const char* cusolver_error_to_string(cusolverStatus_t err) +{ + switch (err) { + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); + default: return "CUSOLVER_STATUS_UNKNOWN"; + }; +} + +} // namespace linalg +} // namespace raft + +#undef _CUSOLVER_ERR_TO_STR + +/** + * @brief Error checking macro for cuSOLVER runtime API functions. + * + * Invokes a cuSOLVER runtime API function call, if the call does not return + * CUSolver_STATUS_SUCCESS, throws an exception detailing the cuSOLVER error that occurred + */ +#define RAFT_CUSOLVER_TRY(call) \ + do { \ + cusolverStatus_t const status = (call); \ + if (CUSOLVER_STATUS_SUCCESS != status) { \ + std::string msg{}; \ + SET_ERROR_MSG(msg, \ + "cuSOLVER error encountered at: ", \ + "call='%s', Reason=%d:%s", \ + #call, \ + status, \ + raft::linalg::detail::cusolver_error_to_string(status)); \ + throw raft::cusolver_error(msg); \ + } \ + } while (0) + +// FIXME: remove after consumer rename +#ifndef CUSOLVER_TRY +#define CUSOLVER_TRY(call) RAFT_CUSOLVER_TRY(call) +#endif + +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. 
+// */
+#define RAFT_CUSOLVER_TRY_NO_THROW(call)                               \
+  do {                                                                 \
+    cusolverStatus_t const status = call;                              \
+    if (CUSOLVER_STATUS_SUCCESS != status) {                           \
+      printf("CUSOLVER call='%s' at file=%s line=%d failed with %s\n", \
+             #call,                                                    \
+             __FILE__,                                                 \
+             __LINE__,                                                 \
+             raft::linalg::detail::cusolver_error_to_string(status));  \
+    }                                                                  \
+  } while (0)
+
+// FIXME: remove after cuml rename
+#ifndef CUSOLVER_CHECK
+#define CUSOLVER_CHECK(call) CUSOLVER_TRY(call)
+#endif
+
+#ifndef CUSOLVER_CHECK_NO_THROW
+#define CUSOLVER_CHECK_NO_THROW(call) RAFT_CUSOLVER_TRY_NO_THROW(call)
+#endif
diff --git a/cpp/include/raft/linalg/detail/add.cuh b/cpp/include/raft/linalg/detail/add.cuh
index 794a776dcf..652ffd2e86 100644
--- a/cpp/include/raft/linalg/detail/add.cuh
+++ b/cpp/include/raft/linalg/detail/add.cuh
@@ -19,8 +19,8 @@
 #include "functional.cuh"
 
 #include
-#include
-#include
+#include
+#include
 
 namespace raft {
 namespace linalg {
diff --git a/cpp/include/raft/linalg/detail/axpy.hpp b/cpp/include/raft/linalg/detail/axpy.cuh
similarity index 100%
rename from cpp/include/raft/linalg/detail/axpy.hpp
rename to cpp/include/raft/linalg/detail/axpy.cuh
diff --git a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
similarity index 98%
rename from cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
rename to cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
index 48993886a6..df1fb0a1f3 100644
--- a/cpp/include/raft/linalg/detail/cholesky_r1_update.hpp
+++ b/cpp/include/raft/linalg/detail/cholesky_r1_update.cuh
@@ -18,9 +18,8 @@
 #include "cublas_wrappers.hpp"
 #include "cusolver_wrappers.hpp"
 
-#include
 #include
-#include
+#include
 
 namespace raft {
 namespace linalg {
diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
index 752235d246..7f9abc324e 100644
--- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include
 #include
 #include
 
diff --git a/cpp/include/raft/linalg/detail/divide.hpp b/cpp/include/raft/linalg/detail/divide.cuh
similarity index 96%
rename from cpp/include/raft/linalg/detail/divide.hpp
rename to cpp/include/raft/linalg/detail/divide.cuh
index c694529fb5..cb46ae76de 100644
--- a/cpp/include/raft/linalg/detail/divide.hpp
+++ b/cpp/include/raft/linalg/detail/divide.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include "functional.cuh"
-#include
+#include
 
 namespace raft {
 namespace linalg {
diff --git a/cpp/include/raft/linalg/detail/eig.hpp b/cpp/include/raft/linalg/detail/eig.cuh
similarity index 99%
rename from cpp/include/raft/linalg/detail/eig.hpp
rename to cpp/include/raft/linalg/detail/eig.cuh
index 8716b4de29..1d9a6bfa8f 100644
--- a/cpp/include/raft/linalg/detail/eig.hpp
+++ b/cpp/include/raft/linalg/detail/eig.cuh
@@ -18,10 +18,9 @@
 #include "cusolver_wrappers.hpp"
 
 #include
-#include
 #include
 #include
-#include
+#include
 #include
 #include
 
diff --git a/cpp/include/raft/linalg/detail/eltwise.hpp b/cpp/include/raft/linalg/detail/eltwise.cuh
similarity index 97%
rename from cpp/include/raft/linalg/detail/eltwise.hpp
rename to cpp/include/raft/linalg/detail/eltwise.cuh
index b15717f205..6d728c8b0f 100644
--- a/cpp/include/raft/linalg/detail/eltwise.hpp
+++ b/cpp/include/raft/linalg/detail/eltwise.cuh
@@ -18,8 +18,8 @@
 
 #include "functional.cuh"
 
-#include
-#include
+#include
+#include
 
 namespace raft {
 namespace linalg {
diff --git a/cpp/include/raft/linalg/detail/lanczos.hpp
b/cpp/include/raft/linalg/detail/lanczos.cuh similarity index 100% rename from cpp/include/raft/linalg/detail/lanczos.hpp rename to cpp/include/raft/linalg/detail/lanczos.cuh diff --git a/cpp/include/raft/linalg/detail/lstsq.hpp b/cpp/include/raft/linalg/detail/lstsq.cuh similarity index 98% rename from cpp/include/raft/linalg/detail/lstsq.hpp rename to cpp/include/raft/linalg/detail/lstsq.cuh index 6553394cc4..3eef58b4df 100644 --- a/cpp/include/raft/linalg/detail/lstsq.hpp +++ b/cpp/include/raft/linalg/detail/lstsq.cuh @@ -18,20 +18,19 @@ #include #include -#include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index 94545e59f6..4cfccdcaa3 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/mean_squared_error.hpp b/cpp/include/raft/linalg/detail/mean_squared_error.cuh similarity index 96% rename from cpp/include/raft/linalg/detail/mean_squared_error.hpp rename to cpp/include/raft/linalg/detail/mean_squared_error.cuh index f0a9daebdb..5889314eea 100644 --- a/cpp/include/raft/linalg/detail/mean_squared_error.hpp +++ b/cpp/include/raft/linalg/detail/mean_squared_error.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/multiply.hpp b/cpp/include/raft/linalg/detail/multiply.cuh similarity index 96% rename from cpp/include/raft/linalg/detail/multiply.hpp rename to cpp/include/raft/linalg/detail/multiply.cuh index da06c23aed..ec3ec802de 100644 --- a/cpp/include/raft/linalg/detail/multiply.hpp +++ b/cpp/include/raft/linalg/detail/multiply.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/norm.hpp b/cpp/include/raft/linalg/detail/norm.cuh similarity index 99% rename from cpp/include/raft/linalg/detail/norm.hpp rename to cpp/include/raft/linalg/detail/norm.cuh index fcf98c7daf..03d03497e9 100644 --- a/cpp/include/raft/linalg/detail/norm.hpp +++ b/cpp/include/raft/linalg/detail/norm.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/qr.cuh b/cpp/include/raft/linalg/detail/qr.cuh index 81b1867a82..4aa843081e 100644 --- a/cpp/include/raft/linalg/detail/qr.cuh +++ b/cpp/include/raft/linalg/detail/qr.cuh @@ -18,7 +18,7 @@ #include "cublas_wrappers.hpp" #include "cusolver_wrappers.hpp" -#include +#include #include #include diff --git a/cpp/include/raft/linalg/detail/reduce.hpp b/cpp/include/raft/linalg/detail/reduce.cuh similarity index 95% rename from cpp/include/raft/linalg/detail/reduce.hpp rename to cpp/include/raft/linalg/detail/reduce.cuh index 94c8f5ba52..4d5fa87202 100644 --- a/cpp/include/raft/linalg/detail/reduce.hpp +++ b/cpp/include/raft/linalg/detail/reduce.cuh @@ -17,8 +17,8 @@ #pragma once #include -#include -#include +#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/rsvd.cuh b/cpp/include/raft/linalg/detail/rsvd.cuh index 3dc22a7e89..033534be55 100644 --- 
a/cpp/include/raft/linalg/detail/rsvd.cuh +++ b/cpp/include/raft/linalg/detail/rsvd.cuh @@ -17,14 +17,14 @@ #pragma once #include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index a0d1e2abaa..f7af9e88d6 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -19,7 +19,7 @@ #include "unary_op.cuh" #include #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/linalg/detail/subtract.cuh b/cpp/include/raft/linalg/detail/subtract.cuh index 23d5eded05..084c6d2fd3 100644 --- a/cpp/include/raft/linalg/detail/subtract.cuh +++ b/cpp/include/raft/linalg/detail/subtract.cuh @@ -17,8 +17,8 @@ #pragma once #include -#include -#include +#include +#include namespace raft { namespace linalg { diff --git a/cpp/include/raft/linalg/detail/svd.hpp b/cpp/include/raft/linalg/detail/svd.cuh similarity index 98% rename from cpp/include/raft/linalg/detail/svd.hpp rename to cpp/include/raft/linalg/detail/svd.cuh index 5d349cd101..aa33dcb0a9 100644 --- a/cpp/include/raft/linalg/detail/svd.hpp +++ b/cpp/include/raft/linalg/detail/svd.cuh @@ -18,16 +18,16 @@ #include "cublas_wrappers.hpp" #include "cusolver_wrappers.hpp" -#include -#include -#include +#include +#include +#include #include #include #include #include -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/linalg/detail/transpose.hpp b/cpp/include/raft/linalg/detail/transpose.cuh similarity index 100% rename from cpp/include/raft/linalg/detail/transpose.hpp rename to cpp/include/raft/linalg/detail/transpose.cuh diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh new file mode 100644 index 0000000000..820c42f0ea --- /dev/null +++ b/cpp/include/raft/linalg/divide.cuh @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef __DIVIDE_H
+#define __DIVIDE_H
+
+#pragma once
+
+#include "detail/divide.cuh"
+
+namespace raft {
+namespace linalg {
+
+using detail::divides_scalar;
+
+/**
+ * @defgroup ScalarOps Scalar operations on the input buffer
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param out the output buffer
+ * @param in the input buffer
+ * @param scalar the scalar used in the operations
+ * @param len number of elements in the input buffer
+ * @param stream cuda stream where to launch work
+ * @{
+ */
+template <typename math_t, typename IdxType = int>
+void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
+{
+  detail::divideScalar(out, in, scalar, len, stream);
+}
+/** @} */
+
+};  // end namespace linalg
+};  // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/divide.hpp b/cpp/include/raft/linalg/divide.hpp
index 6c8480bf19..88b919b92a 100644
--- a/cpp/include/raft/linalg/divide.hpp
+++ b/cpp/include/raft/linalg/divide.hpp
@@ -13,10 +13,17 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */
+
+#ifndef __DIVIDE_H
+#define __DIVIDE_H
 
 #pragma once
 
-#include "detail/divide.hpp"
+#include "detail/divide.cuh"
 
 namespace raft {
 namespace linalg {
@@ -43,3 +50,5 @@ void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cud
 
 }; // end namespace linalg
 }; // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
new file mode 100644
index 0000000000..f1f02dc13e
--- /dev/null
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __EIG_H
+#define __EIG_H
+
+#pragma once
+
+#include "detail/eig.cuh"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @defgroup eig Eigen Decomposition Methods
+ * @{
+ */
+
+/**
+ * @brief eig decomp with divide and conquer method for the column-major
+ * symmetric matrices
+ * @param handle raft handle
+ * @param in the input buffer (symmetric matrix that has real eig values and
+ * vectors)
+ * @param n_rows: number of rows of the input
+ * @param n_cols: number of cols of the input
+ * @param eig_vectors: eigenvectors
+ * @param eig_vals: eigen values
+ * @param stream cuda stream
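+ *
+ * Usage sketch (buffers, sizes, and the `cov` matrix are hypothetical, for
+ * illustration only):
+ * @code{.cpp}
+ *   // eigendecomposition of an n x n column-major symmetric device matrix `cov`
+ *   rmm::device_uvector<float> eig_vectors(n * n, stream);
+ *   rmm::device_uvector<float> eig_vals(n, stream);
+ *   raft::linalg::eigDC(handle, cov, n, n, eig_vectors.data(), eig_vals.data(), stream);
+ * @endcode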
+ */
+template <typename math_t>
+void eigDC(const raft::handle_t& handle,
+           const math_t* in,
+           std::size_t n_rows,
+           std::size_t n_cols,
+           math_t* eig_vectors,
+           math_t* eig_vals,
+           cudaStream_t stream)
+{
+  detail::eigDC(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream);
+}
+
+using detail::COPY_INPUT;
+using detail::EigVecMemUsage;
+using detail::OVERWRITE_INPUT;
+
+/**
+ * @brief eig sel decomp with divide and conquer method for the column-major
+ * symmetric matrices
+ * @param handle raft handle
+ * @param in the input buffer (symmetric matrix that has real eig values and
+ * vectors)
+ * @param n_rows: number of rows of the input
+ * @param n_cols: number of cols of the input
+ * @param n_eig_vals: number of eigenvectors to be generated
+ * @param eig_vectors: eigenvectors
+ * @param eig_vals: eigen values
+ * @param memUsage: the memory selection for eig vector output
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void eigSelDC(const raft::handle_t& handle,
+              math_t* in,
+              int n_rows,
+              int n_cols,
+              int n_eig_vals,
+              math_t* eig_vectors,
+              math_t* eig_vals,
+              EigVecMemUsage memUsage,
+              cudaStream_t stream)
+{
+  detail::eigSelDC(handle, in, n_rows, n_cols, n_eig_vals, eig_vectors, eig_vals, memUsage, stream);
+}
+
+/**
+ * @brief overloaded function for eig decomp with Jacobi method for the
+ * column-major symmetric matrices (in parameter)
+ * @param handle: raft handle
+ * @param in: input matrix
+ * @param n_rows: number of rows of the input
+ * @param n_cols: number of cols of the input
+ * @param eig_vectors: eigenvectors
+ * @param eig_vals: eigen values
+ * @param stream: stream on which this function will be run
+ * @param tol: error tolerance for the Jacobi method. Algorithm stops when the
+ * error is below tol
+ * @param sweeps: number of sweeps in the Jacobi algorithm; more sweeps yield
+ * better accuracy.
+ */
+template <typename math_t>
+void eigJacobi(const raft::handle_t& handle,
+               const math_t* in,
+               int n_rows,
+               int n_cols,
+               math_t* eig_vectors,
+               math_t* eig_vals,
+               cudaStream_t stream,
+               math_t tol = 1.e-7,
+               int sweeps = 15)
+{
+  detail::eigJacobi(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream, tol, sweeps);
+}
+/** @} */  // end of eig
+
+};  // end namespace linalg
+};  // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/eig.hpp b/cpp/include/raft/linalg/eig.hpp
index 5c465a3a41..9417b6fb3f 100644
--- a/cpp/include/raft/linalg/eig.hpp
+++ b/cpp/include/raft/linalg/eig.hpp
@@ -13,10 +13,17 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */
+
+#ifndef __EIG_H
+#define __EIG_H
 
 #pragma once
 
-#include "detail/eig.hpp"
+#include "detail/eig.cuh"
 
 namespace raft {
 namespace linalg {
@@ -114,3 +121,5 @@ void eigJacobi(const raft::handle_t& handle,
 
 }; // end namespace linalg
 }; // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh
new file mode 100644
index 0000000000..dbc06a4af3
--- /dev/null
+++ b/cpp/include/raft/linalg/eltwise.cuh
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ELTWISE_H +#define __ELTWISE_H + +#pragma once + +#include "detail/eltwise.cuh" + +namespace raft { +namespace linalg { + +using detail::adds_scalar; + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam InType data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ + detail::scalarAdd(out, in, scalar, len, stream); +} + +using detail::multiplies_scalar; + +template +void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream) +{ + detail::scalarMultiply(out, in, scalar, len, stream); +} +/** @} */ + +/** + * @defgroup BinaryOps Element-wise binary operations on the input buffers + * @tparam InType data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in1 the first input buffer + * @param in2 the second input buffer + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + * @{ + */ +template +void eltwiseAdd( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + detail::eltwiseAdd(out, in1, in2, len, stream); +} + +template +void eltwiseSub( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + detail::eltwiseSub(out, in1, in2, len, stream); +} + +template +void eltwiseMultiply( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + detail::eltwiseMultiply(out, in1, in2, len, stream); +} + +template +void eltwiseDivide( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + detail::eltwiseDivide(out, in1, in2, len, stream); +} + +using detail::divides_check_zero; + +template +void eltwiseDivideCheckZero( + OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream) +{ + detail::eltwiseDivideCheckZero(out, in1, in2, len, stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/eltwise.hpp b/cpp/include/raft/linalg/eltwise.hpp index 5c2a97b57d..0ebefc7c25 100644 --- a/cpp/include/raft/linalg/eltwise.hpp +++ b/cpp/include/raft/linalg/eltwise.hpp @@ -13,10 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
+ */
+
+#ifndef __ELTWISE_H
+#define __ELTWISE_H
 
 #pragma once
 
-#include "detail/eltwise.hpp"
+#include "detail/eltwise.cuh"
 
 namespace raft {
 namespace linalg {
@@ -100,3 +107,5 @@ void eltwiseDivideCheckZero(
 
 }; // end namespace linalg
 }; // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh
new file mode 100644
index 0000000000..9670834ff0
--- /dev/null
+++ b/cpp/include/raft/linalg/gemm.cuh
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __GEMM_H
+#define __GEMM_H
+
+#pragma once
+
+#include "detail/gemm.hpp"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief the wrapper of the cuBLAS gemm function
+ * It computes the following equation: C = alpha .* opA(A) * opB(B) + beta .* C
+ *
+ * @tparam math_t the element type
+ * @tparam DevicePointerMode whether pointers alpha, beta point to device memory
+ * @param [in] handle raft handle
+ * @param [in] trans_a cublas transpose op for A
+ * @param [in] trans_b cublas transpose op for B
+ * @param [in] m number of rows of C
+ * @param [in] n number of columns of C
+ * @param [in] k number of rows of opB(B) / number of columns of opA(A)
+ * @param [in] alpha host or device scalar
+ * @param [in] A such a matrix that the shape of column-major opA(A) is [m, k]
+ * @param [in] lda leading dimension of A
+ * @param [in] B such a matrix that the shape of column-major opB(B) is [k, n]
+ * @param [in] ldb leading dimension of B
+ * @param [in] beta host or device scalar
+ * @param [inout] C column-major matrix of size [m, n]
+ * @param [in] ldc leading dimension of C
+ * @param [in] stream
+ */
+template <typename math_t, bool DevicePointerMode = false>
+void gemm(const raft::handle_t& handle,
+          const bool trans_a,
+          const bool trans_b,
+          const int m,
+          const int n,
+          const int k,
+          const math_t* alpha,
+          const math_t* A,
+          const int lda,
+          const math_t* B,
+          const int ldb,
+          const math_t* beta,
+          math_t* C,
+          const int ldc,
+          cudaStream_t stream)
+{
+  detail::gemm<math_t, DevicePointerMode>(
+    handle, trans_a, trans_b, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, stream);
+}
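+
+// A usage sketch (hypothetical column-major device buffers A, B, C; alpha and
+// beta in host memory, as implied by the default DevicePointerMode = false):
+//   math_t alpha = 1, beta = 0;
+//   raft::linalg::gemm(handle, false, false, m, n, k, &alpha, A, m, B, k, &beta, C, m, stream);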
+
+/**
+ * @brief the wrapper of the cuBLAS gemm function
+ * It computes the following equation: C = alpha . opA(A) * opB(B) + beta . C
+ * @tparam math_t the type of input/output matrices
+ * @param handle raft handle
+ * @param a input matrix
+ * @param n_rows_a number of rows of A
+ * @param n_cols_a number of columns of A
+ * @param b input matrix
+ * @param c output matrix
+ * @param n_rows_c number of rows of C
+ * @param n_cols_c number of columns of C
+ * @param trans_a cublas transpose op for A
+ * @param trans_b cublas transpose op for B
+ * @param alpha scalar
+ * @param beta scalar
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void gemm(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* b,
+          math_t* c,
+          int n_rows_c,
+          int n_cols_c,
+          cublasOperation_t trans_a,
+          cublasOperation_t trans_b,
+          math_t alpha,
+          math_t beta,
+          cudaStream_t stream)
+{
+  detail::gemm(
+    handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
+}
+
+/**
+ * @brief the wrapper of the cuBLAS gemm function
+ * It computes the following equation: C = alpha . opA(A) * opB(B) + beta . C
+ * @tparam math_t the type of input/output matrices
+ * @param handle raft handle
+ * @param a input matrix
+ * @param n_rows_a number of rows of A
+ * @param n_cols_a number of columns of A
+ * @param b input matrix
+ * @param c output matrix
+ * @param n_rows_c number of rows of C
+ * @param n_cols_c number of columns of C
+ * @param trans_a cublas transpose op for A
+ * @param trans_b cublas transpose op for B
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void gemm(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* b,
+          math_t* c,
+          int n_rows_c,
+          int n_cols_c,
+          cublasOperation_t trans_a,
+          cublasOperation_t trans_b,
+          cudaStream_t stream)
+{
+  detail::gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, stream);
+}
+
+/**
+ * @brief A wrapper for the cuBLAS GEMM function designed for handling all possible
+ * combinations of operand layouts.
+ * It computes the following equation: Z = alpha . X * Y + beta . Z
+ * @tparam T Data type of input/output matrices (float/double)
+ * @param handle raft handle
+ * @param z output matrix of size M rows x N columns
+ * @param x input matrix of size M rows x K columns
+ * @param y input matrix of size K rows x N columns
+ * @param _M number of rows of X and Z
+ * @param _N number of columns of Y and Z
+ * @param _K number of columns of X and rows of Y
+ * @param isZColMajor Storage layout of Z. true = col major, false = row major
+ * @param isXColMajor Storage layout of X. true = col major, false = row major
+ * @param isYColMajor Storage layout of Y. true = col major, false = row major
+ * @param stream cuda stream
+ * @param alpha scalar
+ * @param beta scalar
+ */
+template <typename T>
+void gemm(const raft::handle_t& handle,
+          T* z,
+          T* x,
+          T* y,
+          int _M,
+          int _N,
+          int _K,
+          bool isZColMajor,
+          bool isXColMajor,
+          bool isYColMajor,
+          cudaStream_t stream,
+          T alpha = T(1.0),
+          T beta = T(0.0))
+{
+  detail::gemm(
+    handle, z, x, y, _M, _N, _K, isZColMajor, isXColMajor, isYColMajor, stream, alpha, beta);
+}
+
+}  // end namespace linalg
+}  // end namespace raft
+
+#endif
diff --git a/cpp/include/raft/linalg/gemm.hpp b/cpp/include/raft/linalg/gemm.hpp
index f22d15e650..736590938b 100644
--- a/cpp/include/raft/linalg/gemm.hpp
+++ b/cpp/include/raft/linalg/gemm.hpp
@@ -13,6 +13,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead. + */ + +#ifndef __GEMM_H +#define __GEMM_H #pragma once @@ -173,3 +180,5 @@ void gemm(const raft::handle_t& handle, } // end namespace linalg } // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/gemv.cuh b/cpp/include/raft/linalg/gemv.cuh new file mode 100644 index 0000000000..26a6386148 --- /dev/null +++ b/cpp/include/raft/linalg/gemv.cuh @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __GEMV_H +#define __GEMV_H + +#pragma once + +#include "detail/gemv.hpp" + +namespace raft { +namespace linalg { + +/** + * @brief the wrapper of cublas gemv function + * It computes the following equation: y = alpha .* op(A) * x + beta .* y + * + * @tparam math_t the element type + * @tparam DevicePointerMode whether pointers alpha, beta point to device memory + * @param [in] handle raft handle + * @param [in] trans_a cublas transpose op for A + * @param [in] m number of rows of A + * @param [in] n number of columns of A + * @param [in] alpha host or device scalar + * @param [in] A column-major matrix of size [m, n] + * @param [in] lda leading dimension of A + * @param [in] x vector of length n if trans_a else m + * @param [in] incx stride between consecutive elements of x + * @param [in] beta host or device scalar + * @param [inout] y vector of length m if trans_a else n + * @param [in] incy stride between consecutive elements of y + * @param [in] stream + */ +template +void gemv(const raft::handle_t& handle, + const bool trans_a, + const int m, + const int n, + const math_t* alpha, + const math_t* A, + const int lda, + const math_t* x, + const int incx, + const math_t* beta, + math_t* y, + const int incy, + cudaStream_t stream) +{ + detail::gemv( + handle, trans_a, m, n, alpha, A, lda, x, incx, beta, y, incy, stream); +} + +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows, + const int n_cols, + const math_t* x, + const int incx, + math_t* y, + const int incy, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + detail::gemv(handle, A, n_rows, n_cols, x, incx, y, incy, trans_a, alpha, beta, stream); +} + +/** + * y = alpha * op(A) * x + beta * y + * + * where + * + * @param handle raft handle + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param alpha is a scalar scale of Ax. + * @param beta is a scalar scale of y. 
+ * @param stream stream on which this function is run + */ +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, alpha, beta, stream); +} + +/** + * y = op(A) * x + * + * where + * + * @param handle raft handle + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param stream stream on which this function is run + */ +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ + detail::gemv(handle, A, n_rows_a, n_cols_a, x, y, trans_a, stream); +} + +/** + * y = alpha * op(A) * x + beta * y + * + * where + * @param handle raft handle + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A + * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. + * set it when you need to use only the first n_rows_a rows of the matrix A, which has + * (perhaps, due to padding) lda rows. + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. + * @param trans_a whether to take transpose of a + * @param alpha is a scalar scale of Ax. + * @param beta is a scalar scale of y. + * @param stream stream on which this function is run + */ +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + const math_t alpha, + const math_t beta, + cudaStream_t stream) +{ + detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream); +} + +/** + * y = op(A) * x + * + * where + * @param handle raft handle + * @param A is a column-major matrix of size n_rows_a * n_cols_a. + * op(A) is either the transpose operation (trans_a == true) or identity. + * @param n_rows_a number of rows in A + * @param n_cols_a number of cols in A + * @param lda is the leading dimension of A (number of rows); lda must be not smaller than n_rows_a. + * set it when you need to use only the first n_rows_a rows of the matrix A, which has + * (perhaps, due to padding) lda rows. + * @param x is a vector of size `trans_a ? n_rows_a : n_cols_a`. + * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`. 
+ * @param trans_a whether to take transpose of a + * @param stream stream on which this function is run + * + */ +template +void gemv(const raft::handle_t& handle, + const math_t* A, + const int n_rows_a, + const int n_cols_a, + const int lda, + const math_t* x, + math_t* y, + const bool trans_a, + cudaStream_t stream) +{ + detail::gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, stream); +} + +}; // namespace linalg +}; // namespace raft +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/gemv.hpp b/cpp/include/raft/linalg/gemv.hpp index 2098027b16..d6e0e0326b 100644 --- a/cpp/include/raft/linalg/gemv.hpp +++ b/cpp/include/raft/linalg/gemv.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __GEMV_H +#define __GEMV_H #pragma once @@ -206,3 +213,5 @@ void gemv(const raft::handle_t& handle, }; // namespace linalg }; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/init.cuh b/cpp/include/raft/linalg/init.cuh new file mode 100644 index 0000000000..2fdf9dceb9 --- /dev/null +++ b/cpp/include/raft/linalg/init.cuh @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __INIT_H +#define __INIT_H + +#pragma once + +#include "detail/init.hpp" + +namespace raft { +namespace linalg { + +/** + * @brief Like Python range. + * + * Fills the output as out[i] = i. + * + * \param [out] out device array, size [end-start] + * \param [in] start of the range + * \param [in] end of range (exclusive) + * \param [in] stream cuda stream + */ +template +void range(T* out, int start, int end, cudaStream_t stream) +{ + detail::range(out, start, end, stream); +} + +/** + * @brief Like Python range. + * + * Fills the output as out[i] = i. + * + * \param [out] out device array, size [n] + * \param [in] n length of the array + * \param [in] stream cuda stream + */ +template +void range(T* out, int n, cudaStream_t stream) +{ + detail::range(out, n, stream); +} + +} // namespace linalg +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/init.hpp b/cpp/include/raft/linalg/init.hpp index bb577672e8..af3486f278 100644 --- a/cpp/include/raft/linalg/init.hpp +++ b/cpp/include/raft/linalg/init.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
+ */ + +#ifndef __INIT_H +#define __INIT_H #pragma once @@ -54,3 +61,5 @@ void range(T* out, int n, cudaStream_t stream) } // namespace linalg } // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/lanczos.cuh b/cpp/include/raft/linalg/lanczos.cuh new file mode 100644 index 0000000000..a7157adfab --- /dev/null +++ b/cpp/include/raft/linalg/lanczos.cuh @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __LANCZOS_H +#define __LANCZOS_H + +#pragma once + +#include "detail/lanczos.cuh" +#include + +namespace raft { +namespace linalg { + +// ========================================================= +// Eigensolver +// ========================================================= + +/** + * @brief Compute smallest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are smallest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the smallest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th smallest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param iter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Smallest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to smallest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. 
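+ *
+ * Usage sketch (the matrix `A` and the device buffers are hypothetical, for
+ * illustration only):
+ * @code{.cpp}
+ *   int iter = 0;
+ *   raft::linalg::computeSmallestEigenvectors(
+ *     handle, A, 4, 100, 40, 1e-4f, false, iter, eig_vals_dev, eig_vecs_dev);
+ * @endcode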
+ */ +template +int computeSmallestEigenvectors( + handle_t const& handle, + spectral::matrix::sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 1234567) +{ + return detail::computeSmallestEigenvectors(handle, + A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + iter, + eigVals_dev, + eigVecs_dev, + seed); +} + +/** + * @brief Compute largest eigenvectors of symmetric matrix + * Computes eigenvalues and eigenvectors that are least + * positive. If matrix is positive definite or positive + * semidefinite, the computed eigenvalues are largest in + * magnitude. + * The largest eigenvalue is estimated by performing several + * Lanczos iterations. An implicitly restarted Lanczos method is + * then applied to A+s*I, where s is negative the largest + * eigenvalue. + * @tparam index_type_t the type of data used for indexing. + * @tparam value_type_t the type of data used for weights, distances. + * @param handle the raft handle. + * @param A Matrix. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter Maximum number of Lanczos steps. Does not include + * Lanczos steps used to estimate largest eigenvalue. + * @param restartIter Maximum size of Lanczos system before + * performing an implicit restart. Should be at least 4. + * @param tol Convergence tolerance. Lanczos iteration will + * terminate when the residual norm is less than tol*theta, where + * theta is an estimate for the largest unwanted eigenvalue + * (i.e. the (nEigVecs+1)th largest eigenvalue). + * @param reorthogonalize Whether to reorthogonalize Lanczos + * vectors. + * @param iter On exit, pointer to total number of Lanczos + * iterations performed. Does not include Lanczos steps used to + * estimate largest eigenvalue. + * @param eigVals_dev (Output, device memory, nEigVecs entries) + * Largest eigenvalues of matrix. + * @param eigVecs_dev (Output, device memory, n*nEigVecs entries) + * Eigenvectors corresponding to largest eigenvalues of + * matrix. Vectors are stored as columns of a column-major matrix + * with dimensions n x nEigVecs. + * @param seed random seed. + * @return error flag. + */ +template +int computeLargestEigenvectors( + handle_t const& handle, + spectral::matrix::sparse_matrix_t const& A, + index_type_t nEigVecs, + index_type_t maxIter, + index_type_t restartIter, + value_type_t tol, + bool reorthogonalize, + index_type_t& iter, + value_type_t* __restrict__ eigVals_dev, + value_type_t* __restrict__ eigVecs_dev, + unsigned long long seed = 123456) +{ + return detail::computeLargestEigenvectors(handle, + A, + nEigVecs, + maxIter, + restartIter, + tol, + reorthogonalize, + iter, + eigVals_dev, + eigVecs_dev, + seed); +} + +} // namespace linalg +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp index 21b65158fc..7663af3cb2 100644 --- a/cpp/include/raft/linalg/lanczos.hpp +++ b/cpp/include/raft/linalg/lanczos.hpp @@ -13,10 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
+ */ + +#ifndef __LANCZOS_H +#define __LANCZOS_H #pragma once -#include "detail/lanczos.hpp" +#include "detail/lanczos.cuh" #include namespace raft { @@ -156,3 +163,5 @@ int computeLargestEigenvectors( } // namespace linalg } // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/lstsq.cuh b/cpp/include/raft/linalg/lstsq.cuh new file mode 100644 index 0000000000..255f1293f4 --- /dev/null +++ b/cpp/include/raft/linalg/lstsq.cuh @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __LSTSQ_H +#define __LSTSQ_H + +#pragma once + +#include +#include +namespace raft { +namespace linalg { + +/** Solves the linear ordinary least squares problem `Aw = b` + * Via SVD decomposition of `A = U S Vt` using default cuSOLVER routine. + * + * @param[in] handle raft handle + * @param[inout] A input feature matrix. + * Warning: the content of this matrix is modified by the cuSOLVER routines. + * @param[in] n_rows number of rows in A + * @param[in] n_cols number of columns in A + * @param[inout] b input target vector. + * Warning: the content of this vector is modified by the cuSOLVER routines. + * @param[out] w output coefficient vector + * @param[in] stream cuda stream for ordering operations + */ +template +void lstsqSvdQR(const raft::handle_t& handle, + math_t* A, + const int n_rows, + const int n_cols, + const math_t* b, + math_t* w, + cudaStream_t stream) +{ + detail::lstsqSvdQR(handle, A, n_rows, n_cols, b, w, stream); +} + +/** Solves the linear ordinary least squares problem `Aw = b` + * Via SVD decomposition of `A = U S V^T` using Jacobi iterations (cuSOLVER). + * + * @param[in] handle raft handle + * @param[inout] A input feature matrix. + * Warning: the content of this matrix is modified by the cuSOLVER routines. + * @param[in] n_rows number of rows in A + * @param[in] n_cols number of columns in A + * @param[inout] b input target vector. + * Warning: the content of this vector is modified by the cuSOLVER routines. + * @param[out] w output coefficient vector + * @param[in] stream cuda stream for ordering operations + */ +template +void lstsqSvdJacobi(const raft::handle_t& handle, + math_t* A, + const int n_rows, + const int n_cols, + const math_t* b, + math_t* w, + cudaStream_t stream) +{ + detail::lstsqSvdJacobi(handle, A, n_rows, n_cols, b, w, stream); +} + +/** Solves the linear ordinary least squares problem `Aw = b` + * via eigenvalue decomposition of `A^T * A` (covariance matrix for dataset A). + * (`w = (A^T A)^-1 A^T b`) + */ +template +void lstsqEig(const raft::handle_t& handle, + const math_t* A, + const int n_rows, + const int n_cols, + const math_t* b, + math_t* w, + cudaStream_t stream) +{ + detail::lstsqEig(handle, A, n_rows, n_cols, b, w, stream); +} + +/** Solves the linear ordinary least squares problem `Aw = b` + * via QR decomposition of `A = QR`. 
+ * (triangular system of equations `Rw = Q^T b`) + * + * @param[in] handle raft handle + * @param[inout] A input feature matrix. + * Warning: the content of this matrix is modified by the cuSOLVER routines. + * @param[in] n_rows number of rows in A + * @param[in] n_cols number of columns in A + * @param[inout] b input target vector. + * Warning: the content of this vector is modified by the cuSOLVER routines. + * @param[out] w output coefficient vector + * @param[in] stream cuda stream for ordering operations + */ +template +void lstsqQR(const raft::handle_t& handle, + math_t* A, + const int n_rows, + const int n_cols, + math_t* b, + math_t* w, + cudaStream_t stream) +{ + detail::lstsqQR(handle, A, n_rows, n_cols, b, w, stream); +} + +}; // namespace linalg +}; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/lstsq.hpp b/cpp/include/raft/linalg/lstsq.hpp index 57dd0a7b15..008fcab653 100644 --- a/cpp/include/raft/linalg/lstsq.hpp +++ b/cpp/include/raft/linalg/lstsq.hpp @@ -13,11 +13,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __LSTSQ_H +#define __LSTSQ_H #pragma once #include -#include +#include namespace raft { namespace linalg { @@ -115,3 +122,5 @@ void lstsqQR(const raft::handle_t& handle, }; // namespace linalg }; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh new file mode 100644 index 0000000000..5df4d24b4f --- /dev/null +++ b/cpp/include/raft/linalg/map.cuh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __MAP_H +#define __MAP_H + +#pragma once + +#include "detail/map.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief CUDA version of map + * @tparam InType data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @tparam OutType data-type in which the result will be stored + * @param out the output of the map operation (assumed to be a device pointer) + * @param len number of elements in the input array + * @param map the device-lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays + */ + +template +void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... 
args) +{ + detail::mapImpl(out, len, map, stream, in, args...); +} + +} // namespace linalg +}; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/map.hpp b/cpp/include/raft/linalg/map.hpp index febeaa8621..d4ee231eb1 100644 --- a/cpp/include/raft/linalg/map.hpp +++ b/cpp/include/raft/linalg/map.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __MAP_H +#define __MAP_H #pragma once @@ -48,3 +55,5 @@ void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* } // namespace linalg }; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh new file mode 100644 index 0000000000..36828cf154 --- /dev/null +++ b/cpp/include/raft/linalg/map_then_reduce.cuh @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __MAP_THEN_REDUCE_H +#define __MAP_THEN_REDUCE_H + +#pragma once + +#include "detail/map_then_reduce.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief CUDA version of map and then sum reduction operation + * @tparam Type data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @param out the output sum-reduced value (assumed to be a device pointer) + * @param len number of elements in the input array + * @param map the device-lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays + */ + +template +void mapThenSumReduce( + OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + detail::mapThenReduceImpl( + out, len, (OutType)0, map, detail::sum_tag(), stream, in, args...); +} + +/** + * @brief CUDA version of map and then generic reduction operation + * @tparam Type data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual map operation + * @tparam ReduceLambda the device-lambda performing the actual reduction + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @param out the output reduced value (assumed to be a device pointer) + * @param len number of elements in the input array + * @param neutral The neutral element of the reduction operation. 
For example: + * 0 for sum, 1 for multiply, +Inf for Min, -Inf for Max + * @param map the device-lambda + * @param op the reduction device lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays + */ + +template +void mapThenReduce(OutType* out, + size_t len, + OutType neutral, + MapOp map, + ReduceLambda op, + cudaStream_t stream, + const InType* in, + Args... args) +{ + detail::mapThenReduceImpl( + out, len, neutral, map, op, stream, in, args...); +} +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/map_then_reduce.hpp b/cpp/include/raft/linalg/map_then_reduce.hpp index 04275995a0..c4b136d1b8 100644 --- a/cpp/include/raft/linalg/map_then_reduce.hpp +++ b/cpp/include/raft/linalg/map_then_reduce.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __MAP_THEN_REDUCE_H +#define __MAP_THEN_REDUCE_H #pragma once @@ -85,3 +92,5 @@ void mapThenReduce(OutType* out, } }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh new file mode 100644 index 0000000000..56437313e3 --- /dev/null +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __MATRIX_VECTOR_OP_H +#define __MATRIX_VECTOR_OP_H + +#pragma once + +#include "detail/matrix_vector_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Operations for all the columns or rows with a given vector. + * Caution : Threads process multiple elements to speed up processing. These + * are loaded in a single read thanks to type promotion. Faster processing + * would thus only be enabled when adresses are optimally aligned for it. 
+ * Note : the function will also check that the size of the window of accesses + * is a multiple of the number of elements processed by a thread in order to + * enable faster processing + * @tparam Type the matrix/vector type + * @tparam Lambda a device function which represents a binary operator + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output matrix (passing out = matrix makes it in-place) + * @param matrix the input matrix + * @param vec the vector + * @param D number of columns of matrix + * @param N number of rows of matrix + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether the broadcast of vector needs to happen along + * the rows of the matrix or columns + * @param op the mathematical operation + * @param stream cuda stream where to launch work + */ +template +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); +} + +/** + * @brief Operations for all the columns or rows with the given vectors. + * Caution : Threads process multiple elements to speed up processing. These + * are loaded in a single read thanks to type promotion. Faster processing + * would thus only be enabled when adresses are optimally aligned for it. + * Note : the function will also check that the size of the window of accesses + * is a multiple of the number of elements processed by a thread in order to + * enable faster processing + * @tparam Type the matrix/vector type + * @tparam Lambda a device function which represents a binary operator + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads per block of the cuda kernel launched + * @param out the output matrix (passing out = matrix makes it in-place) + * @param matrix the input matrix + * @param vec1 the first vector + * @param vec2 the second vector + * @param D number of columns of matrix + * @param N number of rows of matrix + * @param rowMajor whether input is row or col major + * @param bcastAlongRows whether the broadcast of vector needs to happen along + * the rows of the matrix or columns + * @param op the mathematical operation + * @param stream cuda stream where to launch work + */ +template +void matrixVectorOp(Type* out, + const Type* matrix, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Lambda op, + cudaStream_t stream) +{ + detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/matrix_vector_op.hpp b/cpp/include/raft/linalg/matrix_vector_op.hpp index b9790ebce2..c041d4c263 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.hpp +++ b/cpp/include/raft/linalg/matrix_vector_op.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
+ */ + +#ifndef __MATRIX_VECTOR_OP_H +#define __MATRIX_VECTOR_OP_H #pragma once @@ -99,3 +106,5 @@ void matrixVectorOp(Type* out, }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh new file mode 100644 index 0000000000..1b3297f926 --- /dev/null +++ b/cpp/include/raft/linalg/mean_squared_error.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __MSE_H +#define __MSE_H + +#pragma once + +#include "detail/mean_squared_error.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief CUDA version mean squared error function mean((A-B)**2) + * @tparam math_t data-type upon which the math operation will be performed + * @tparam TPB threads-per-block + * @param out the output mean squared error value (assumed to be a device pointer) + * @param A input array (assumed to be a device pointer) + * @param B input array (assumed to be a device pointer) + * @param len number of elements in the input arrays + * @param weight weight to apply to every term in the mean squared error calculation + * @param stream cuda-stream where to launch this kernel + */ +template +void meanSquaredError( + math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream) +{ + detail::meanSquaredError(out, A, B, len, weight, stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/mean_squared_error.hpp b/cpp/include/raft/linalg/mean_squared_error.hpp index 42af8642b6..95428d47e0 100644 --- a/cpp/include/raft/linalg/mean_squared_error.hpp +++ b/cpp/include/raft/linalg/mean_squared_error.hpp @@ -13,10 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __MSE_H +#define __MSE_H #pragma once -#include "detail/mean_squared_error.hpp" +#include "detail/mean_squared_error.cuh" namespace raft { namespace linalg { @@ -41,3 +48,5 @@ void meanSquaredError( }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh new file mode 100644 index 0000000000..f1161b23cb --- /dev/null +++ b/cpp/include/raft/linalg/multiply.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __MULTIPLY_H +#define __MULTIPLY_H + +#pragma once + +#include "detail/multiply.cuh" + +namespace raft { +namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ + detail::multiplyScalar(out, in, scalar, len, stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/multiply.hpp b/cpp/include/raft/linalg/multiply.hpp index 4a1628b44a..260fb25018 100644 --- a/cpp/include/raft/linalg/multiply.hpp +++ b/cpp/include/raft/linalg/multiply.hpp @@ -13,10 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __MULTIPLY_H +#define __MULTIPLY_H #pragma once -#include "detail/multiply.hpp" +#include "detail/multiply.cuh" namespace raft { namespace linalg { @@ -41,3 +48,5 @@ void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, c }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh new file mode 100644 index 0000000000..87bd2a2b0a --- /dev/null +++ b/cpp/include/raft/linalg/norm.cuh @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __NORM_H +#define __NORM_H + +#pragma once + +#include "detail/norm.cuh" + +namespace raft { +namespace linalg { + +/** different types of norms supported on the input buffers */ +using detail::L1Norm; +using detail::L2Norm; +using detail::NormType; + +/** + * @brief Compute row-wise norm of the input matrix and perform fin_op lambda + * + * Row-wise norm is useful while computing pairwise distance matrix, for + * example. + * This is used in many clustering algos like knn, kmeans, dbscan, etc... The + * current implementation is optimized only for bigger values of 'D'. 
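+ *
+ * A minimal usage sketch (hypothetical device buffers; accumulates the
+ * squared L2 norm of each row of a row-major N x D float matrix; pass a
+ * square-root fin_op to obtain the Euclidean norm):
+ * @code{.cpp}
+ * raft::linalg::rowNorm(norms, data, D, N, raft::linalg::L2Norm, true, stream);
+ * @endcode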
+ * + * @tparam Type the data type + * @tparam Lambda device final lambda + * @tparam IdxType Integer type used to for addressing + * @param dots the output vector of row-wise dot products + * @param data the input matrix (currently assumed to be row-major) + * @param D number of columns of data + * @param N number of rows of data + * @param type the type of norm to be applied + * @param rowMajor whether the input is row-major or not + * @param stream cuda stream where to launch work + * @param fin_op the final lambda op + */ +template > +void rowNorm(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ + detail::rowNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); +} + +/** + * @brief Compute column-wise norm of the input matrix and perform fin_op + * @tparam Type the data type + * @tparam Lambda device final lambda + * @tparam IdxType Integer type used to for addressing + * @param dots the output vector of column-wise dot products + * @param data the input matrix (currently assumed to be row-major) + * @param D number of columns of data + * @param N number of rows of data + * @param type the type of norm to be applied + * @param rowMajor whether the input is row-major or not + * @param stream cuda stream where to launch work + * @param fin_op the final lambda op + */ +template > +void colNorm(Type* dots, + const Type* data, + IdxType D, + IdxType N, + NormType type, + bool rowMajor, + cudaStream_t stream, + Lambda fin_op = raft::Nop()) +{ + detail::colNormCaller(dots, data, D, N, type, rowMajor, stream, fin_op); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/norm.hpp b/cpp/include/raft/linalg/norm.hpp index a6336769ca..7be524f6de 100644 --- a/cpp/include/raft/linalg/norm.hpp +++ b/cpp/include/raft/linalg/norm.hpp @@ -13,10 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __NORM_H +#define __NORM_H #pragma once -#include "detail/norm.hpp" +#include "detail/norm.cuh" namespace raft { namespace linalg { @@ -88,3 +95,5 @@ void colNorm(Type* dots, }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/power.cuh b/cpp/include/raft/linalg/power.cuh index d17fa9a043..f94fcfc894 100644 --- a/cpp/include/raft/linalg/power.cuh +++ b/cpp/include/raft/linalg/power.cuh @@ -13,12 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#ifndef __POWER_H +#define __POWER_H #pragma once #include -#include -#include +#include +#include namespace raft { namespace linalg { @@ -63,3 +65,5 @@ void power(math_t* out, const math_t* in1, const math_t* in2, IdxType len, cudaS }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/power.hpp b/cpp/include/raft/linalg/power.hpp new file mode 100644 index 0000000000..124ee8513a --- /dev/null +++ b/cpp/include/raft/linalg/power.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __POWER_H +#define __POWER_H + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in the input buffer + * @param scalar the scalar used in the operations + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void powerScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp( + out, in, len, [scalar] __device__(math_t in) { return raft::myPow(in, scalar); }, stream); +} +/** @} */ + +/** + * @defgroup BinaryOps Element-wise binary operations on the input buffers + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in1 the first input buffer + * @param in2 the second input buffer + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + * @{ + */ +template +void power(math_t* out, const math_t* in1, const math_t* in2, IdxType len, cudaStream_t stream) +{ + raft::linalg::binaryOp( + out, in1, in2, len, [] __device__(math_t a, math_t b) { return raft::myPow(a, b); }, stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh new file mode 100644 index 0000000000..fe6a5263ca --- /dev/null +++ b/cpp/include/raft/linalg/qr.cuh @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __QR_H +#define __QR_H + +#pragma once + +#include "detail/qr.cuh" + +namespace raft { +namespace linalg { + +/** + * @defgroup QRdecomp QR decomposition + * @{ + */ + +/** + * @brief compute QR decomp and return only Q matrix + * @param handle: raft handle + * @param M: input matrix + * @param Q: Q matrix to be returned (on GPU) + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream cuda stream + * @{ + */ +template +void qrGetQ(const raft::handle_t& handle, + const math_t* M, + math_t* Q, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + detail::qrGetQ(handle, M, Q, n_rows, n_cols, stream); +} + +/** + * @brief compute QR decomp and return both Q and R matrices + * @param handle: raft handle + * @param M: input matrix + * @param Q: Q matrix to be returned (on GPU) + * @param R: R matrix to be returned (on GPU) + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream cuda stream + */ +template +void qrGetQR(const raft::handle_t& handle, + math_t* M, + math_t* Q, + math_t* R, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + detail::qrGetQR(handle, M, Q, R, n_rows, n_cols, stream); +} +/** @} */ + +}; // namespace linalg +}; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/qr.hpp b/cpp/include/raft/linalg/qr.hpp index 50e97e4069..da8736b46f 100644 --- a/cpp/include/raft/linalg/qr.hpp +++ b/cpp/include/raft/linalg/qr.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __QR_H +#define __QR_H #pragma once @@ -72,3 +79,5 @@ void qrGetQR(const raft::handle_t& handle, }; // namespace linalg }; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh new file mode 100644 index 0000000000..7640da8c2d --- /dev/null +++ b/cpp/include/raft/linalg/reduce.cuh @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __REDUCE_H +#define __REDUCE_H + +#pragma once + +#include "detail/reduce.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief Compute reduction of the input matrix along the requested dimension + * + * @tparam InType the data type of the input + * @tparam OutType the data type of the output (as well as the data type for + * which reduction is performed) + * @tparam IdxType data type of the indices of the array + * @tparam MainLambda Unary lambda applied while acculumation (eg: L1 or L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*FinalLambda)(OutType);
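+ *
+ * A minimal usage sketch (hypothetical device buffers; computes row-wise
+ * Euclidean norms of a row-major N x D float matrix by squaring in main_op,
+ * adding in reduce_op and taking the square root in final_op):
+ * @code{.cpp}
+ * raft::linalg::reduce(
+ *   norms, data, D, N, 0.f, true, true, stream, false,
+ *   [] __device__(float x, int) { return x * x; },
+ *   raft::Sum<float>(),
+ *   [] __device__(float x) { return raft::mySqrt(x); });
+ * @endcode
+ *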
+ * @param dots the output reduction vector
+ * @param data the input matrix
+ * @param D number of columns
+ * @param N number of rows
+ * @param init initial value to use for the reduction
+ * @param rowMajor input matrix is row-major or not
+ * @param alongRows whether to reduce along rows or columns
+ * @param stream cuda stream where to launch work
+ * @param inplace reduction result added inplace or overwrites old values?
+ * @param main_op elementwise operation to apply before reduction
+ * @param reduce_op binary reduction operation
+ * @param final_op elementwise operation to apply before storing results
+ */
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
+          typename ReduceLambda = raft::Sum<OutType>,
+          typename FinalLambda  = raft::Nop<OutType>>
+void reduce(OutType* dots,
+            const InType* data,
+            int D,
+            int N,
+            OutType init,
+            bool rowMajor,
+            bool alongRows,
+            cudaStream_t stream,
+            bool inplace           = false,
+            MainLambda main_op     = raft::Nop<InType, IdxType>(),
+            ReduceLambda reduce_op = raft::Sum<OutType>(),
+            FinalLambda final_op   = raft::Nop<OutType>())
+{
+  detail::reduce(
+    dots, data, D, N, init, rowMajor, alongRows, stream, inplace, main_op, reduce_op, final_op);
+}
+
+}; // end namespace linalg
+}; // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/reduce.hpp b/cpp/include/raft/linalg/reduce.hpp
index 1c4ef70df8..b9f057771a 100644
--- a/cpp/include/raft/linalg/reduce.hpp
+++ b/cpp/include/raft/linalg/reduce.hpp
@@ -13,6 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */
+
+#ifndef __REDUCE_H
+#define __REDUCE_H
 
 #pragma once
 
-#include "detail/reduce.hpp"
+#include "detail/reduce.cuh"
 
 namespace raft {
 namespace linalg {
@@ -75,3 +82,5 @@ void reduce(OutType* dots,
 
 }; // end namespace linalg
 }; // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.cuh b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
index 82d272671c..2336639258 100644
--- a/cpp/include/raft/linalg/reduce_cols_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.cuh
@@ -13,6 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+#ifndef __REDUCE_COLS_BY_KEY
+#define __REDUCE_COLS_BY_KEY
 
 #pragma once
 
@@ -52,3 +54,5 @@ void reduce_cols_by_key(const T* data,
 }
 }; // end namespace linalg
 }; // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/reduce_cols_by_key.hpp b/cpp/include/raft/linalg/reduce_cols_by_key.hpp
new file mode 100644
index 0000000000..a338d8572b
--- /dev/null
+++ b/cpp/include/raft/linalg/reduce_cols_by_key.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */
+
+#ifndef __REDUCE_COLS_BY_KEY
+#define __REDUCE_COLS_BY_KEY
+
+#pragma once
+
+#include <raft/linalg/detail/reduce_cols_by_key.cuh>
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief Computes the sum-reduction of matrix columns for each given key
+ * @tparam T the input data type (as well as the output reduced matrix)
+ * @tparam KeyIteratorT random-access iterator type for reading the keys (may be a simple pointer)
+ * @tparam IdxType indexing arithmetic type
+ * @param data the input data (dim = nrows x ncols). This is assumed to be in
+ * row-major layout
+ * @param keys keys array (len = ncols). It is assumed that each key in this
+ * array is between [0, nkeys). In case this is not true, the caller is expected
+ * to have called make_monotonic primitive to prepare such a contiguous and
+ * monotonically increasing keys array.
+ * @param out the output reduced matrix along columns (dim = nrows x nkeys).
+ * This will be assumed to be in row-major layout
+ * @param nrows number of rows in the input data
+ * @param ncols number of columns in the input data
+ * @param nkeys number of unique keys in the keys array
+ * @param stream cuda stream to launch the kernel onto
+ */
+template <typename T, typename KeyIteratorT, typename IdxType = int>
+void reduce_cols_by_key(const T* data,
+                        const KeyIteratorT keys,
+                        T* out,
+                        IdxType nrows,
+                        IdxType ncols,
+                        IdxType nkeys,
+                        cudaStream_t stream)
+{
+  detail::reduce_cols_by_key(data, keys, out, nrows, ncols, nkeys, stream);
+}
+}; // end namespace linalg
+}; // end namespace raft
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 76d4ed4971..ca7a956986 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,6 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+#ifndef __REDUCE_ROWS_BY_KEY
+#define __REDUCE_ROWS_BY_KEY
 
 #pragma once
 
@@ -108,3 +110,5 @@ void reduce_rows_by_key(const DataIteratorT d_A,
 
 }; // end namespace linalg
 }; // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.hpp b/cpp/include/raft/linalg/reduce_rows_by_key.hpp
new file mode 100644
index 0000000000..70ce9eaa4f
--- /dev/null
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.hpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */ + +#ifndef __REDUCE_ROWS_BY_KEY +#define __REDUCE_ROWS_BY_KEY + +#pragma once + +#include + +namespace raft { +namespace linalg { + +/** + Small helper function to convert from int->char and char->int + Transform ncols*nrows read of int in 2*nrows reads of int + ncols*rows reads of chars +**/ +template +void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st) +{ + detail::convert_array(dst, src, n, st); +} + +/** + * @brief Computes the weighted reduction of matrix rows for each given key + * + * @tparam DataIteratorT Random-access iterator type, for reading input matrix + * (may be a simple pointer type) + * @tparam KeysIteratorT Random-access iterator type, for reading input keys + * (may be a simple pointer type) + * + * @param[in] d_A Input data array (lda x nrows) + * @param[in] lda Real row size for input data, d_A + * @param[in] d_keys Keys for each row (1 x nrows) + * @param[in] d_weights Weights for each observation in d_A (1 x nrows) + * @param[out] d_keys_char Scratch memory for conversion of keys to char + * @param[in] nrows Number of rows in d_A and d_keys + * @param[in] ncols Number of data columns in d_A + * @param[in] nkeys Number of unique keys in d_keys + * @param[out] d_sums Row sums by key (ncols x d_keys) + * @param[in] stream CUDA stream + */ +template +void reduce_rows_by_key(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + const WeightT* d_weights, + char* d_keys_char, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums, + cudaStream_t stream) +{ + detail::reduce_rows_by_key( + d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream); +} + +/** + * @brief Computes the reduction of matrix rows for each given key + * @tparam DataIteratorT Random-access iterator type, for reading input matrix (may be a simple + * pointer type) + * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple + * pointer type) + * @param[in] d_A Input data array (lda x nrows) + * @param[in] lda Real row size for input data, d_A + * @param[in] d_keys Keys for each row (1 x nrows) + * @param d_keys_char Scratch memory for conversion of keys to char + * @param[in] nrows Number of rows in d_A and d_keys + * @param[in] ncols Number of data columns in d_A + * @param[in] nkeys Number of unique keys in d_keys + * @param[out] d_sums Row sums by key (ncols x d_keys) + * @param[in] stream CUDA stream + */ +template +void reduce_rows_by_key(const DataIteratorT d_A, + int lda, + const KeysIteratorT d_keys, + char* d_keys_char, + int nrows, + int ncols, + int nkeys, + DataIteratorT d_sums, + cudaStream_t stream) +{ + typedef typename std::iterator_traits::value_type DataType; + reduce_rows_by_key(d_A, + lda, + d_keys, + static_cast(nullptr), + d_keys_char, + nrows, + ncols, + nkeys, + d_sums, + stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/rsvd.cuh b/cpp/include/raft/linalg/rsvd.cuh index d1d739489f..f5eaba7526 100644 --- a/cpp/include/raft/linalg/rsvd.cuh +++ b/cpp/include/raft/linalg/rsvd.cuh @@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#ifndef __RSVD_H +#define __RSVD_H #pragma once @@ -137,3 +139,5 @@ void rsvdPerc(const raft::handle_t& handle, }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/rsvd.hpp b/cpp/include/raft/linalg/rsvd.hpp new file mode 100644 index 0000000000..2dd5faa332 --- /dev/null +++ b/cpp/include/raft/linalg/rsvd.hpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __RSVD_H +#define __RSVD_H + +#pragma once + +#include + +namespace raft { +namespace linalg { + +/** + * @brief randomized singular value decomposition (RSVD) on the column major + * float type input matrix (Jacobi-based), by specifying no. of PCs and + * upsamples directly + * @param handle: raft handle + * @param M: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param S_vec: singular values of input matrix + * @param U: left singular values of input matrix + * @param V: right singular values of input matrix + * @param k: no. of singular values to be computed + * @param p: no. of upsamples + * @param use_bbt: whether use eigen decomposition in computation or not + * @param gen_left_vec: left vector needs to be generated or not? + * @param gen_right_vec: right vector needs to be generated or not? + * @param use_jacobi: whether to jacobi solver for decomposition + * @param tol: tolerance for Jacobi-based solvers + * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers + * @param stream cuda stream + */ +template +void rsvdFixedRank(const raft::handle_t& handle, + math_t* M, + int n_rows, + int n_cols, + math_t* S_vec, + math_t* U, + math_t* V, + int k, + int p, + bool use_bbt, + bool gen_left_vec, + bool gen_right_vec, + bool use_jacobi, + math_t tol, + int max_sweeps, + cudaStream_t stream) +{ + detail::rsvdFixedRank(handle, + M, + n_rows, + n_cols, + S_vec, + U, + V, + k, + p, + use_bbt, + gen_left_vec, + gen_right_vec, + use_jacobi, + tol, + max_sweeps, + stream); +} + +/** + * @brief randomized singular value decomposition (RSVD) on the column major + * float type input matrix (Jacobi-based), by specifying the PC and upsampling + * ratio + * @param handle: raft handle + * @param M: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param S_vec: singular values of input matrix + * @param U: left singular values of input matrix + * @param V: right singular values of input matrix + * @param PC_perc: percentage of singular values to be computed + * @param UpS_perc: upsampling percentage + * @param use_bbt: whether use eigen decomposition in computation or not + * @param gen_left_vec: left vector needs to be generated or not? + * @param gen_right_vec: right vector needs to be generated or not? 
+ * @param use_jacobi: whether to jacobi solver for decomposition + * @param tol: tolerance for Jacobi-based solvers + * @param max_sweeps: maximum number of sweeps for Jacobi-based solvers + * @param stream cuda stream + */ +template +void rsvdPerc(const raft::handle_t& handle, + math_t* M, + int n_rows, + int n_cols, + math_t* S_vec, + math_t* U, + math_t* V, + math_t PC_perc, + math_t UpS_perc, + bool use_bbt, + bool gen_left_vec, + bool gen_right_vec, + bool use_jacobi, + math_t tol, + int max_sweeps, + cudaStream_t stream) +{ + detail::rsvdPerc(handle, + M, + n_rows, + n_cols, + S_vec, + U, + V, + PC_perc, + UpS_perc, + use_bbt, + gen_left_vec, + gen_right_vec, + use_jacobi, + tol, + max_sweeps, + stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/sqrt.cuh b/cpp/include/raft/linalg/sqrt.cuh index c431cfdcc0..b58bc752ac 100644 --- a/cpp/include/raft/linalg/sqrt.cuh +++ b/cpp/include/raft/linalg/sqrt.cuh @@ -13,11 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#ifndef __SQRT_H +#define __SQRT_H #pragma once #include -#include +#include namespace raft { namespace linalg { @@ -42,3 +44,5 @@ void sqrt(math_t* out, const math_t* in, IdxType len, cudaStream_t stream) }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/sqrt.hpp b/cpp/include/raft/linalg/sqrt.hpp new file mode 100644 index 0000000000..9856173248 --- /dev/null +++ b/cpp/include/raft/linalg/sqrt.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SQRT_H +#define __SQRT_H + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { + +/** + * @defgroup ScalarOps Scalar operations on the input buffer + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out the output buffer + * @param in the input buffer + * @param len number of elements in the input buffer + * @param stream cuda stream where to launch work + * @{ + */ +template +void sqrt(math_t* out, const math_t* in, IdxType len, cudaStream_t stream) +{ + raft::linalg::unaryOp( + out, in, len, [] __device__(math_t in) { return raft::mySqrt(in); }, stream); +} +/** @} */ + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh new file mode 100644 index 0000000000..941e64dcb1 --- /dev/null +++ b/cpp/include/raft/linalg/strided_reduction.cuh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __STRIDED_REDUCTION_H
+#define __STRIDED_REDUCTION_H
+
+#pragma once
+
+#include "detail/strided_reduction.cuh"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief Compute reduction of the input matrix along the strided dimension
+ *
+ * @tparam InType the data type of the input
+ * @tparam OutType the data type of the output (as well as the data type for
+ * which reduction is performed)
+ * @tparam IdxType data type of the indices of the array
+ * @tparam MainLambda Unary lambda applied during accumulation (eg: L1 or L2 norm)
+ * It must be a 'callable' supporting the following input and output:
+ *
OutType (*MainLambda)(InType, IdxType);
+ * @tparam ReduceLambda Binary lambda applied for reduction (eg: addition(+) for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*ReduceLambda)(OutType, OutType);
+ * @tparam FinalLambda the final lambda applied before STG (eg: Sqrt for L2 norm) + * It must be a 'callable' supporting the following input and output: + *
OutType (*FinalLambda)(OutType);
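+ *
+ * A minimal usage sketch (hypothetical device buffers; with the default
+ * no-op/add lambdas this produces one sum per column of a row-major
+ * N x D float matrix):
+ * @code{.cpp}
+ * raft::linalg::stridedReduction(col_sums, data, D, N, 0.f, stream);
+ * @endcode
+ *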
+ * @param dots the output reduction vector
+ * @param data the input matrix
+ * @param D leading dimension of data
+ * @param N second dimension of data
+ * @param init initial value to use for the reduction
+ * @param main_op elementwise operation to apply before reduction
+ * @param reduce_op binary reduction operation
+ * @param final_op elementwise operation to apply before storing results
+ * @param inplace reduction result added inplace or overwrites old values?
+ * @param stream cuda stream where to launch work
+ */
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
+          typename ReduceLambda = raft::Sum<OutType>,
+          typename FinalLambda  = raft::Nop<OutType>>
+void stridedReduction(OutType* dots,
+                      const InType* data,
+                      IdxType D,
+                      IdxType N,
+                      OutType init,
+                      cudaStream_t stream,
+                      bool inplace           = false,
+                      MainLambda main_op     = raft::Nop<InType, IdxType>(),
+                      ReduceLambda reduce_op = raft::Sum<OutType>(),
+                      FinalLambda final_op   = raft::Nop<OutType>())
+{
+  detail::stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+}
+
+}; // end namespace linalg
+}; // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/strided_reduction.hpp b/cpp/include/raft/linalg/strided_reduction.hpp
index 0f97323e5a..534f8edcf7 100644
--- a/cpp/include/raft/linalg/strided_reduction.hpp
+++ b/cpp/include/raft/linalg/strided_reduction.hpp
@@ -13,6 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */
+
+#ifndef __STRIDED_REDUCTION_H
+#define __STRIDED_REDUCTION_H
 
 #pragma once
 
@@ -70,3 +77,5 @@ void stridedReduction(OutType* dots,
 
 }; // end namespace linalg
 }; // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh
new file mode 100644
index 0000000000..9ca36ddddf
--- /dev/null
+++ b/cpp/include/raft/linalg/subtract.cuh
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __SUBTRACT_H
+#define __SUBTRACT_H
+
+#pragma once
+
+#include "detail/subtract.cuh"
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief Elementwise scalar subtraction operation on the input buffer
+ *
+ * @tparam InT input data-type. Also the data-type upon which the math ops
+ * will be performed
+ * @tparam OutT output data-type
+ * @tparam IdxType Integer type used for addressing
+ *
+ * @param out the output buffer
+ * @param in the input buffer
+ * @param scalar the scalar used in the operations
+ * @param len number of elements in the input buffer
+ * @param stream cuda stream where to launch work
+ */
+template <typename InT, typename OutT = InT, typename IdxType = int>
+void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
+{
+  detail::subtractScalar(out, in, scalar, len, stream);
+}
+
+/**
+ * @brief Elementwise subtraction operation on the input buffers
+ * @tparam InT input data-type.
Also the data-type upon which the math ops + * will be performed + * @tparam OutT output data-type + * @tparam IdxType Integer type used to for addressing + * + * @param out the output buffer + * @param in1 the first input buffer + * @param in2 the second input buffer + * @param len number of elements in the input buffers + * @param stream cuda stream where to launch work + */ +template +void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream) +{ + detail::subtract(out, in1, in2, len, stream); +} + +/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and + * write result to outDev[i] + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param outDev the output buffer + * @param inDev the input buffer + * @param singleScalarDev pointer to the scalar located in device memory + * @param len number of elements in the input and output buffer + * @param stream cuda stream + * @remark block size has not been tuned + */ +template +void subtractDevScalar(math_t* outDev, + const math_t* inDev, + const math_t* singleScalarDev, + IdxType len, + cudaStream_t stream) +{ + detail::subtractDevScalar(outDev, inDev, singleScalarDev, len, stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/subtract.hpp b/cpp/include/raft/linalg/subtract.hpp index 9d48948cad..2420ce69e2 100644 --- a/cpp/include/raft/linalg/subtract.hpp +++ b/cpp/include/raft/linalg/subtract.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SUBTRACT_H +#define __SUBTRACT_H #pragma once @@ -83,3 +90,5 @@ void subtractDevScalar(math_t* outDev, }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh new file mode 100644 index 0000000000..b48def90a3 --- /dev/null +++ b/cpp/include/raft/linalg/svd.cuh @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __SVD_H +#define __SVD_H + +#pragma once + +#include "detail/svd.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief singular value decomposition (SVD) on the column major float type + * input matrix using QR method + * @param handle: raft handle + * @param in: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param sing_vals: singular values of input matrix + * @param left_sing_vecs: left singular values of input matrix + * @param right_sing_vecs: right singular values of input matrix + * @param trans_right: transpose right vectors or not + * @param gen_left_vec: generate left eig vector. Not activated. + * @param gen_right_vec: generate right eig vector. Not activated. + * @param stream cuda stream + */ +// TODO: activate gen_left_vec and gen_right_vec options +// TODO: couldn't template this function due to cusolverDnSgesvd and +// cusolverSnSgesvd. Check if there is any other way. +template +void svdQR(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* sing_vals, + T* left_sing_vecs, + T* right_sing_vecs, + bool trans_right, + bool gen_left_vec, + bool gen_right_vec, + cudaStream_t stream) +{ + detail::svdQR(handle, + in, + n_rows, + n_cols, + sing_vals, + left_sing_vecs, + right_sing_vecs, + trans_right, + gen_left_vec, + gen_right_vec, + stream); +} + +template +void svdEig(const raft::handle_t& handle, + T* in, + int n_rows, + int n_cols, + T* S, + T* U, + T* V, + bool gen_left_vec, + cudaStream_t stream) +{ + detail::svdEig(handle, in, n_rows, n_cols, S, U, V, gen_left_vec, stream); +} + +/** + * @brief on the column major input matrix using Jacobi method + * @param handle: raft handle + * @param in: input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param sing_vals: singular values of input matrix + * @param left_sing_vecs: left singular vectors of input matrix + * @param right_sing_vecs: right singular vectors of input matrix + * @param gen_left_vec: generate left eig vector. Not activated. + * @param gen_right_vec: generate right eig vector. Not activated. + * @param tol: error tolerance for the jacobi method. Algorithm stops when the + * error is below tol + * @param max_sweeps: number of sweeps in the Jacobi algorithm. The more the better + * accuracy. 
+ * @param stream cuda stream + */ +template +void svdJacobi(const raft::handle_t& handle, + math_t* in, + int n_rows, + int n_cols, + math_t* sing_vals, + math_t* left_sing_vecs, + math_t* right_sing_vecs, + bool gen_left_vec, + bool gen_right_vec, + math_t tol, + int max_sweeps, + cudaStream_t stream) +{ + detail::svdJacobi(handle, + in, + n_rows, + n_cols, + sing_vals, + left_sing_vecs, + right_sing_vecs, + gen_left_vec, + gen_right_vec, + tol, + max_sweeps, + stream); +} + +/** + * @brief reconstruct a matrix use left and right singular vectors and + * singular values + * @param handle: raft handle + * @param U: left singular vectors of size n_rows x k + * @param S: square matrix with singular values on its diagonal, k x k + * @param V: right singular vectors of size n_cols x k + * @param out: reconstructed matrix to be returned + * @param n_rows: number rows of output matrix + * @param n_cols: number columns of output matrix + * @param k: number of singular values + * @param stream cuda stream + */ +template +void svdReconstruction(const raft::handle_t& handle, + math_t* U, + math_t* S, + math_t* V, + math_t* out, + int n_rows, + int n_cols, + int k, + cudaStream_t stream) +{ + detail::svdReconstruction(handle, U, S, V, out, n_rows, n_cols, k, stream); +} + +/** + * @brief reconstruct a matrix use left and right singular vectors and + * singular values + * @param handle: raft handle + * @param A_d: input matrix + * @param U: left singular vectors of size n_rows x k + * @param S_vec: singular values as a vector + * @param V: right singular vectors of size n_cols x k + * @param n_rows: number rows of output matrix + * @param n_cols: number columns of output matrix + * @param k: number of singular values to be computed, 1.0 for normal SVD + * @param tol: tolerance for the evaluation + * @param stream cuda stream + */ +template +bool evaluateSVDByL2Norm(const raft::handle_t& handle, + math_t* A_d, + math_t* U, + math_t* S_vec, + math_t* V, + int n_rows, + int n_cols, + int k, + math_t tol, + cudaStream_t stream) +{ + return detail::evaluateSVDByL2Norm(handle, A_d, U, S_vec, V, n_rows, n_cols, k, tol, stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/svd.hpp b/cpp/include/raft/linalg/svd.hpp index a30180b174..765f364d5b 100644 --- a/cpp/include/raft/linalg/svd.hpp +++ b/cpp/include/raft/linalg/svd.hpp @@ -13,10 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SVD_H +#define __SVD_H #pragma once -#include "detail/svd.hpp" +#include "detail/svd.cuh" namespace raft { namespace linalg { @@ -182,3 +189,5 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle, }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh index be411e6492..158cca168d 100644 --- a/cpp/include/raft/linalg/ternary_op.cuh +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#ifndef __TERNARY_OP_H +#define __TERNARY_OP_H + #pragma once #include @@ -47,4 +50,6 @@ void ternaryOp(math_t* out, } }; // end namespace linalg -}; // end namespace raft \ No newline at end of file +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/ternary_op.hpp b/cpp/include/raft/linalg/ternary_op.hpp new file mode 100644 index 0000000000..1e8892211c --- /dev/null +++ b/cpp/include/raft/linalg/ternary_op.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __TERNARY_OP_H +#define __TERNARY_OP_H + +#pragma once + +#include + +namespace raft { +namespace linalg { +/** + * @brief perform element-wise ternary operation on the input arrays + * @tparam math_t data-type upon which the math operation will be performed + * @tparam Lambda the device-lambda performing the actual operation + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param in3 the third input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + */ +template +void ternaryOp(math_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + detail::ternaryOp(out, in1, in2, in3, len, op, stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/transpose.cuh b/cpp/include/raft/linalg/transpose.cuh new file mode 100644 index 0000000000..a9ada5125a --- /dev/null +++ b/cpp/include/raft/linalg/transpose.cuh @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __TRANSPOSE_H +#define __TRANSPOSE_H + +#pragma once + +#include "detail/transpose.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief transpose on the column major input matrix using Jacobi method + * @param handle: raft handle + * @param in: input matrix + * @param out: output. 
Transposed input matrix + * @param n_rows: number rows of input matrix + * @param n_cols: number columns of input matrix + * @param stream: cuda stream + */ +template +void transpose(const raft::handle_t& handle, + math_t* in, + math_t* out, + int n_rows, + int n_cols, + cudaStream_t stream) +{ + detail::transpose(handle, in, out, n_rows, n_cols, stream); +} + +/** + * @brief transpose on the column major input matrix using Jacobi method + * @param inout: input and output matrix + * @param n: number of rows and columns of input matrix + * @param stream: cuda stream + */ +template +void transpose(math_t* inout, int n, cudaStream_t stream) +{ + detail::transpose(inout, n, stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/transpose.hpp b/cpp/include/raft/linalg/transpose.hpp index 50608877fa..765d523b16 100644 --- a/cpp/include/raft/linalg/transpose.hpp +++ b/cpp/include/raft/linalg/transpose.hpp @@ -13,10 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __TRANSPOSE_H +#define __TRANSPOSE_H #pragma once -#include "detail/transpose.hpp" +#include "detail/transpose.cuh" namespace raft { namespace linalg { @@ -55,3 +62,5 @@ void transpose(math_t* inout, int n, cudaStream_t stream) }; // end namespace linalg }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh new file mode 100644 index 0000000000..f2466df463 --- /dev/null +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __UNARY_OP_H +#define __UNARY_OP_H + +#pragma once + +#include "detail/unary_op.cuh" + +namespace raft { +namespace linalg { + +/** + * @brief perform element-wise unary operation in the input array + * @tparam InType input data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam OutType output data-type + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in the input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + * @note Lambda must be a functor with the following signature: + * `OutType func(const InType& val);` + */ +template +void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + detail::unaryOpCaller(out, in, len, op, stream); +} + +/** + * @brief Perform an element-wise unary operation into the output array + * + * Compared to `unaryOp()`, this method does not do any reads from any inputs + * + * @tparam OutType output data-type + * @tparam Lambda the device-lambda performing the actual operation + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * + * @param[out] out the output array [on device] [len = len] + * @param[in] len number of elements in the input array + * @param[in] op the device-lambda which must be of the form: + * `void func(OutType* outLocationOffset, IdxType idx);` + * where outLocationOffset will be out + idx. + * @param[in] stream cuda stream where to launch work + */ +template +void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +{ + detail::writeOnlyUnaryOpCaller(out, len, op, stream); +} + +}; // end namespace linalg +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/unary_op.hpp b/cpp/include/raft/linalg/unary_op.hpp index 51faa2e4a4..12d841340b 100644 --- a/cpp/include/raft/linalg/unary_op.hpp +++ b/cpp/include/raft/linalg/unary_op.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __UNARY_OP_H +#define __UNARY_OP_H #pragma once @@ -71,3 +78,5 @@ void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) }; // end namespace linalg }; // end namespace raft + +#endif diff --git a/cpp/include/raft/matrix/col_wise_sort.cuh b/cpp/include/raft/matrix/col_wise_sort.cuh new file mode 100644 index 0000000000..afdec24ebd --- /dev/null +++ b/cpp/include/raft/matrix/col_wise_sort.cuh @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __COL_WISE_SORT_H +#define __COL_WISE_SORT_H + +#pragma once + +#include + +namespace raft { +namespace matrix { + +/** + * @brief sort columns within each row of row-major input matrix and return sorted indexes + * modelled as key-value sort with key being input matrix and value being index of values + * @param in: input matrix + * @param out: output value(index) matrix + * @param n_rows: number rows of input matrix + * @param n_columns: number columns of input matrix + * @param bAllocWorkspace: check returned value, if true allocate workspace passed in workspaceSize + * @param workspacePtr: pointer to workspace memory + * @param workspaceSize: Size of workspace to be allocated + * @param stream: cuda stream to execute prim on + * @param sortedKeys: Optional, output matrix for sorted keys (input) + */ +template +void sort_cols_per_row(const InType* in, + OutType* out, + int n_rows, + int n_columns, + bool& bAllocWorkspace, + void* workspacePtr, + size_t& workspaceSize, + cudaStream_t stream, + InType* sortedKeys = nullptr) +{ + detail::sortColumnsPerRow( + in, out, n_rows, n_columns, bAllocWorkspace, workspacePtr, workspaceSize, stream, sortedKeys); +} +}; // end namespace matrix +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/matrix/col_wise_sort.hpp b/cpp/include/raft/matrix/col_wise_sort.hpp index 7ace5881bc..f259bc71a8 100644 --- a/cpp/include/raft/matrix/col_wise_sort.hpp +++ b/cpp/include/raft/matrix/col_wise_sort.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __COL_WISE_SORT_H +#define __COL_WISE_SORT_H #pragma once @@ -50,3 +57,5 @@ void sort_cols_per_row(const InType* in, } }; // end namespace matrix }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh index 6b32cbc06e..9e996e19d9 100644 --- a/cpp/include/raft/matrix/detail/math.cuh +++ b/cpp/include/raft/matrix/detail/math.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,10 +20,10 @@ #include #include -#include -#include -#include -#include +#include +#include +#include +#include #include #include diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index f057ba283c..3fa602d865 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh new file mode 100644 index 0000000000..9e103afda5 --- /dev/null +++ b/cpp/include/raft/matrix/math.cuh @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
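One plausible two-phase calling pattern implied by the `bAllocWorkspace`/`workspaceSize` parameters of `sort_cols_per_row` (hypothetical buffer names; not the only valid flow):

  bool need_ws  = false;
  size_t ws_len = 0;
  // First call may only report the workspace requirement.
  raft::matrix::sort_cols_per_row(d_keys, d_idx, n_rows, n_cols, need_ws, nullptr, ws_len, stream);
  if (need_ws) {
    // Allocate the requested scratch space, then sort for real.
    rmm::device_uvector<char> ws(ws_len, stream);
    raft::matrix::sort_cols_per_row(d_keys, d_idx, n_rows, n_cols, need_ws, ws.data(), ws_len, stream);
  }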
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MATH_H +#define __MATH_H + +#pragma once + +#include "detail/math.cuh" + +namespace raft { +namespace matrix { + +/** + * @defgroup MatrixMathOp math operation on the input matrix + * @{ + */ + +/** + * @brief Power of every element in the input matrix + * @param in: input matrix + * @param out: output matrix. The result is stored in the out matrix + * @param scalar: every element is multiplied with scalar. + * @param len: number elements of input matrix + * @param stream cuda stream + */ +template +void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream) +{ + detail::power(in, out, scalar, len, stream); +} + +/** + * @brief Power of every element in the input matrix + * @param inout: input matrix and also the result is stored + * @param scalar: every element is multiplied with scalar. + * @param len: number elements of input matrix + * @param stream cuda stream + */ +template +void power(math_t* inout, math_t scalar, int len, cudaStream_t stream) +{ + detail::power(inout, scalar, len, stream); +} + +/** + * @brief Power of every element in the input matrix + * @param inout: input matrix and also the result is stored + * @param len: number elements of input matrix + * @param stream cuda stream + */ +template +void power(math_t* inout, int len, cudaStream_t stream) +{ + detail::power(inout, len, stream); +} + +/** + * @brief Power of every element in the input matrix + * @param in: input matrix + * @param out: output matrix. The result is stored in the out matrix + * @param len: number elements of input matrix + * @param stream cuda stream + * @{ + */ +template +void power(math_t* in, math_t* out, int len, cudaStream_t stream) +{ + detail::power(in, out, len, stream); +} + +/** + * @brief Square root of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param in: input matrix and also the result is stored + * @param out: output matrix. 
The result is stored in the out matrix + * @param scalar: every element is multiplied with scalar + * @param len: number elements of input matrix + * @param stream cuda stream + * @param set_neg_zero whether to set negative numbers to zero + */ +template +void seqRoot(math_t* in, + math_t* out, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool set_neg_zero = false) +{ + detail::seqRoot(in, out, scalar, len, stream, set_neg_zero); +} + +/** + * @brief Square root of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param inout: input matrix and also the result is stored + * @param scalar: every element is multiplied with scalar + * @param len: number elements of input matrix + * @param stream cuda stream + * @param set_neg_zero whether to set negative numbers to zero + */ +template +void seqRoot( + math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false) +{ + detail::seqRoot(inout, scalar, len, stream, set_neg_zero); +} + +/** + * @brief Square root of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param in: input matrix and also the result is stored + * @param out: output matrix. The result is stored in the out matrix + * @param len: number elements of input matrix + * @param stream cuda stream + */ +template +void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream) +{ + detail::seqRoot(in, out, len, stream); +} + +/** + * @brief Square root of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param inout: input matrix with in-place results + * @param len: number elements of input matrix + * @param stream cuda stream + */ +template +void seqRoot(math_t* inout, IdxType len, cudaStream_t stream) +{ + detail::seqRoot(inout, len, stream); +} + +/** + * @brief sets the small values to zero based on a defined threshold + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param out: output matrix. 
The result is stored in the out matrix + * @param in: input matrix + * @param len: number elements of input matrix + * @param stream cuda stream + * @param thres threshold to set values to zero + */ +template +void setSmallValuesZero( + math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ + detail::setSmallValuesZero(out, in, len, stream, thres); +} + +/** + * @brief sets the small values to zero based on a defined threshold + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param inout: input matrix and also the result is stored + * @param len: number elements of input matrix + * @param stream cuda stream + * @param thres: threshold + */ +template +void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15) +{ + detail::setSmallValuesZero(inout, len, stream, thres); +} + +/** + * @brief Reciprocal of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param in: input matrix and also the result is stored + * @param out: output matrix. The result is stored in the out matrix + * @param scalar: every element is multiplied with scalar + * @param len: number elements of input matrix + * @param stream cuda stream + * @param setzero round down to zero if the input is less the threshold + * @param thres the threshold used to forcibly set inputs to zero + * @{ + */ +template +void reciprocal(math_t* in, + math_t* out, + math_t scalar, + int len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ + detail::reciprocal(in, out, scalar, len, stream, setzero, thres); +} + +/** + * @brief Reciprocal of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param inout: input matrix with in-place results + * @param scalar: every element is multiplied with scalar + * @param len: number elements of input matrix + * @param stream cuda stream + * @param setzero round down to zero if the input is less the threshold + * @param thres the threshold used to forcibly set inputs to zero + * @{ + */ +template +void reciprocal(math_t* inout, + math_t scalar, + IdxType len, + cudaStream_t stream, + bool setzero = false, + math_t thres = 1e-15) +{ + detail::reciprocal(inout, scalar, len, stream, setzero, thres); +} + +/** + * @brief Reciprocal of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param inout: input matrix and also the result is stored + * @param len: number elements of input matrix + * @param stream cuda stream + */ +template +void reciprocal(math_t* inout, IdxType len, cudaStream_t stream) +{ + detail::reciprocal(inout, len, stream); +} + +/** + * @brief Reciprocal of every element in the input matrix + * @tparam math_t data-type upon which the math operation will be performed + * @tparam IdxType Integer type used to for addressing + * @param in: input matrix and also the result is stored + * @param out: output matrix. 
The result is stored in the out matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType = int>
+void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream)
+{
+  detail::reciprocal(in, out, len, stream);
+}
+
+/**
+ * @brief set values to scalar in matrix
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @param out output matrix. The result is stored in the out matrix
+ * @param in input matrix
+ * @param scalar scalar value
+ * @param len number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t>
+void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0)
+{
+  detail::setValue(out, in, scalar, len, stream);
+}
+
+/**
+ * @brief computes the ratio of every element to the sum of the input vector
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam IdxType Integer type used for addressing
+ * @param handle raft handle
+ * @param src: input matrix
+ * @param dest: output matrix. The result is stored in the dest matrix
+ * @param len: number of elements in the input matrix
+ * @param stream cuda stream
+ */
+template <typename math_t, typename IdxType = int>
+void ratio(
+  const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream)
+{
+  detail::ratio(handle, src, dest, len, stream);
+}
+
+/** @} */
+
+/**
+ * @brief Argmax: find the row index with the maximum value for each column
+ * @param in: input matrix
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param out: output vector of size n_cols
+ * @param stream: cuda stream
+ */
+template <typename math_t>
+void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream)
+{
+  detail::argmax(in, n_rows, n_cols, out, stream);
+}
+
+/**
+ * @brief sign flip for PCA. This is used to stabilize the sign of column
+ * major eigen vectors. Flips the sign if the column has negative |max|.
+ * @param inout: input matrix.
Result also stored in this parameter + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param stream cuda stream + */ +template +void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream) +{ + detail::signFlip(inout, n_rows, n_cols, stream); +} + +/** + * @brief multiply each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ +template +void matrixVectorBinaryMult(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + detail::matrixVectorBinaryMult( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); +} + +/** + * @brief multiply each row or column of matrix with vector, skipping zeros in vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ +template +void matrixVectorBinaryMultSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + detail::matrixVectorBinaryMultSkipZero( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); +} + +/** + * @brief divide each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ +template +void matrixVectorBinaryDiv(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + detail::matrixVectorBinaryDiv( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); +} + +/** + * @brief divide each row or column of matrix with vector, skipping zeros in vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + * @param return_zero result is zero if true and vector value is below threshold, original value if + * false + */ +template +void matrixVectorBinaryDivSkipZero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream, + bool return_zero = false) +{ + detail::matrixVectorBinaryDivSkipZero( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream, return_zero); +} + +/** + * @brief add each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether 
matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ +template +void matrixVectorBinaryAdd(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + detail::matrixVectorBinaryAdd( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); +} + +/** + * @brief subtract each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ +template +void matrixVectorBinarySub(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + detail::matrixVectorBinarySub( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); +} + +}; // end namespace matrix +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp index 619e20a702..ab02c8a85f 100644 --- a/cpp/include/raft/matrix/math.hpp +++ b/cpp/include/raft/matrix/math.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __MATH_H +#define __MATH_H #pragma once @@ -461,3 +468,5 @@ void matrixVectorBinarySub(Type* data, }; // end namespace matrix }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh new file mode 100644 index 0000000000..1af7e37dec --- /dev/null +++ b/cpp/include/raft/matrix/matrix.cuh @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MATRIX_H +#define __MATRIX_H + +#pragma once + +#include "detail/linewise_op.cuh" +#include "detail/matrix.cuh" + +#include + +namespace raft { +namespace matrix { + +using namespace std; + +/** + * @brief Copy selected rows of the input matrix into contiguous space. + * + * On exit out[i + k*n_rows] = in[indices[i] + k*n_rows], + * where i = 0..n_rows_indices-1, and k = 0..n_cols-1. 
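A few call sketches for the raft::matrix math wrappers defined above (hypothetical device pointers; the scalar arguments are chosen arbitrarily):

  raft::matrix::power(d_in, d_out, 0.5f, len, stream);             // scaled element-wise power
  raft::matrix::seqRoot(d_inout, len, stream);                     // in-place square root
  raft::matrix::argmax(d_in, n_rows, n_cols, d_col_max, stream);   // per-column argmax row indices
  // Broadcast a vector over a column-major matrix, in place.
  raft::matrix::matrixVectorBinaryMult(
    d_data, d_vec, n_rows, n_cols, /*rowMajor=*/false, /*bcastAlongRows=*/true, stream);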
+ * + * @param in input matrix + * @param n_rows number of rows of output matrix + * @param n_cols number of columns of output matrix + * @param out output matrix + * @param indices of the rows to be copied + * @param n_rows_indices number of rows to copy + * @param stream cuda stream + * @param rowMajor whether the matrix has row major layout + */ +template +void copyRows(const m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + const idx_array_t* indices, + idx_t n_rows_indices, + cudaStream_t stream, + bool rowMajor = false) +{ + detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, rowMajor); +} + +/** + * @brief copy matrix operation for column major matrices. + * @param in: input matrix + * @param out: output matrix + * @param n_rows: number of rows of output matrix + * @param n_cols: number of columns of output matrix + * @param stream: cuda stream + */ +template +void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + raft::copy_async(out, in, n_rows * n_cols, stream); +} + +/** + * @brief copy matrix operation for column major matrices. First n_rows and + * n_cols of input matrix "in" is copied to "out" matrix. + * @param in: input matrix + * @param in_n_rows: number of rows of input matrix + * @param out: output matrix + * @param out_n_rows: number of rows of output matrix + * @param out_n_cols: number of columns of output matrix + * @param stream: cuda stream + */ +template +void truncZeroOrigin( + m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream) +{ + detail::truncZeroOrigin(in, in_n_rows, out, out_n_rows, out_n_cols, stream); +} + +/** + * @brief Columns of a column major matrix is reversed (i.e. first column and + * last column are swapped) + * @param inout: input and output matrix + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param stream: cuda stream + */ +template +void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + detail::colReverse(inout, n_rows, n_cols, stream); +} + +/** + * @brief Rows of a column major matrix is reversed (i.e. 
first row and last + * row are swapped) + * @param inout: input and output matrix + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param stream: cuda stream + */ +template +void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + detail::rowReverse(inout, n_rows, n_cols, stream); +} + +/** + * @brief Prints the data stored in GPU memory + * @param in: input matrix + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param h_separator: horizontal separator character + * @param v_separator: vertical separator character + * @param stream: cuda stream + */ +template +void print(const m_t* in, + idx_t n_rows, + idx_t n_cols, + char h_separator = ' ', + char v_separator = '\n', + cudaStream_t stream = rmm::cuda_stream_default) +{ + detail::print(in, n_rows, n_cols, h_separator, v_separator, stream); +} + +/** + * @brief Prints the data stored in CPU memory + * @param in: input matrix + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + */ +template +void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) +{ + detail::printHost(in, n_rows, n_cols); +} + +/** + * @brief Slice a matrix (in-place) + * @param in: input matrix + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param out: output matrix + * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) + * @param x2, y2: coordinate of the bottom-right point of the wanted area + * (1-based) + * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice_matrix(M_d, 4, + * 3, 0, 1, 4, 3); + * @param stream: cuda stream + */ +template +void sliceMatrix(m_t* in, + idx_t n_rows, + idx_t n_cols, + m_t* out, + idx_t x1, + idx_t y1, + idx_t x2, + idx_t y2, + cudaStream_t stream) +{ + detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, stream); +} + +/** + * @brief Copy the upper triangular part of a matrix to another + * @param src: input matrix with a size of n_rows x n_cols + * @param dst: output matrix with a size of kxk, k = min(n_rows, n_cols) + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param stream: cuda stream + */ +template +void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + detail::copyUpperTriangular(src, dst, n_rows, n_cols, stream); +} + +/** + * @brief Initialize a diagonal matrix with a vector + * @param vec: vector of length k = min(n_rows, n_cols) + * @param matrix: matrix of size n_rows x n_cols + * @param n_rows: number of rows of the matrix + * @param n_cols: number of columns of the matrix + * @param stream: cuda stream + */ +template +void initializeDiagonalMatrix( + m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, stream); +} + +/** + * @brief Get a square matrix with elements on diagonal reversed (in-place) + * @param in: square input matrix with size len x len + * @param len: size of one side of the matrix + * @param stream: cuda stream + */ +template +void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) +{ + detail::getDiagonalInverseMatrix(in, len, stream); +} + +/** + * @brief Get the L2/F-norm of a matrix/vector + * @param handle + * @param in: input matrix/vector with totally size elements + * @param size: size of the 
matrix/vector + * @param stream: cuda stream + */ +template +m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) +{ + return detail::getL2Norm(handle, in, size, stream); +} + +/** + * Run a function over matrix lines (rows or columns) with a variable number + * row-vectors or column-vectors. + * The term `line` here signifies that the lines can be either columns or rows, + * depending on the matrix layout. + * What matters is if the vectors are applied along lines (indices of vectors correspond to + * indices within lines), or across lines (indices of vectors correspond to line numbers). + * + * @param [out] out result of the operation; can be same as `in`; should be aligned the same + * as `in` to allow faster vectorized memory transfers. + * @param [in] in input matrix consisting of `nLines` lines, each `lineLen`-long. + * @param [in] lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in + * col-major) + * @param [in] nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major) + * @param [in] alongLines whether vectors are indices along or across lines. + * @param [in] op the operation applied on each line: + * for i in [0..lineLen) and j in [0..nLines): + * out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i]) if alongLines = true + * out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j]) if alongLines = false + * where matrix indexing is row-major ([i, j] = [i + lineLen * j]). + * @param [in] stream a cuda stream for the kernels + * @param [in] vecs zero or more vectors to be passed as arguments, + * size of each vector is `alongLines ? lineLen : nLines`. + */ +template +void linewiseOp(m_t* out, + const m_t* in, + const idx_t lineLen, + const idx_t nLines, + const bool alongLines, + Lambda op, + cudaStream_t stream, + Vecs... vecs) +{ + common::nvtx::range fun_scope("linewiseOp-%c-%zu (%zu, %zu)", + alongLines ? 'l' : 'x', + sizeof...(Vecs), + size_t(lineLen), + size_t(nLines)); + detail::MatrixLinewiseOp<16, 256>::run( + out, in, lineLen, nLines, alongLines, op, stream, vecs...); +} + +}; // end namespace matrix +}; // end namespace raft + +#endif diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index e3e2f88d14..cf5f5d1f25 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __MATRIX_H +#define __MATRIX_H #pragma once @@ -271,3 +278,5 @@ void linewiseOp(m_t* out, }; // end namespace matrix }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp index 151c49af7c..96aa622525 100644 --- a/cpp/include/raft/mr/buffer_base.hpp +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
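A sketch of the `linewiseOp` wrapper above, adding a per-line bias vector (hypothetical device pointers; for a column-major matrix, `lineLen` is the row count and `d_bias` needs `lineLen` entries when `alongLines` is true):

  raft::matrix::linewiseOp(
    d_out, d_in, n_rows, n_cols, /*alongLines=*/true,
    [] __device__(float x, float b) { return x + b; }, stream, d_bias);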
diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp index aee3cba046..954ce83d1f 100644 --- a/cpp/include/raft/mr/device/buffer.hpp +++ b/cpp/include/raft/mr/device/buffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp index de9468add8..25aed3e725 100644 --- a/cpp/include/raft/mr/host/buffer.hpp +++ b/cpp/include/raft/mr/host/buffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/random/detail/make_blobs.cuh b/cpp/include/raft/random/detail/make_blobs.cuh index b79178567b..10ded9c93e 100644 --- a/cpp/include/raft/random/detail/make_blobs.cuh +++ b/cpp/include/raft/random/detail/make_blobs.cuh @@ -19,8 +19,8 @@ #include "permute.cuh" #include #include -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/random/detail/make_regression.cuh b/cpp/include/raft/random/detail/make_regression.cuh index eb8eaf565e..8bab85e485 100644 --- a/cpp/include/raft/random/detail/make_regression.cuh +++ b/cpp/include/raft/random/detail/make_regression.cuh @@ -24,15 +24,15 @@ #include #include -#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include #include -#include -#include +#include +#include #include namespace raft::random { diff --git a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh index bf79b3cb71..15789742fd 100644 --- a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh +++ b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh @@ -22,8 +22,8 @@ #include #include #include -#include -#include +#include +#include #include // mvg.cuh takes in matrices that are colomn major (as in fortan) diff --git a/cpp/include/raft/random/make_blobs.cuh b/cpp/include/raft/random/make_blobs.cuh new file mode 100644 index 0000000000..2ad3a7960d --- /dev/null +++ b/cpp/include/raft/random/make_blobs.cuh @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __MAKE_BLOBS_H +#define __MAKE_BLOBS_H + +#pragma once + +#include "detail/make_blobs.cuh" + +namespace raft::random { + +/** + * @brief GPU-equivalent of sklearn.datasets.make_blobs + * + * @tparam DataT output data type + * @tparam IdxT indexing arithmetic type + * + * @param[out] out generated data [on device] + * [dim = n_rows x n_cols] + * @param[out] labels labels for the generated data [on device] + * [len = n_rows] + * @param[in] n_rows number of rows in the generated data + * @param[in] n_cols number of columns in the generated data + * @param[in] n_clusters number of clusters (or classes) to generate + * @param[in] stream cuda stream to schedule the work on + * @param[in] row_major whether input `centers` and output `out` + * buffers are to be stored in row or column + * major layout + * @param[in] centers centers of each of the cluster, pass a nullptr + * if you need this also to be generated randomly + * [on device] [dim = n_clusters x n_cols] + * @param[in] cluster_std standard deviation of each cluster center, + * pass a nullptr if this is to be read from the + * `cluster_std_scalar`. [on device] + * [len = n_clusters] + * @param[in] cluster_std_scalar if 'cluster_std' is nullptr, then use this as + * the std-dev across all dimensions. + * @param[in] shuffle shuffle the generated dataset and labels + * @param[in] center_box_min min value of box from which to pick cluster + * centers. Useful only if 'centers' is nullptr + * @param[in] center_box_max max value of box from which to pick cluster + * centers. Useful only if 'centers' is nullptr + * @param[in] seed seed for the RNG + * @param[in] type RNG type + */ +template +void make_blobs(DataT* out, + IdxT* labels, + IdxT n_rows, + IdxT n_cols, + IdxT n_clusters, + cudaStream_t stream, + bool row_major = true, + const DataT* centers = nullptr, + const DataT* cluster_std = nullptr, + const DataT cluster_std_scalar = (DataT)1.0, + bool shuffle = true, + DataT center_box_min = (DataT)-10.0, + DataT center_box_max = (DataT)10.0, + uint64_t seed = 0ULL, + GeneratorType type = GenPhilox) +{ + detail::make_blobs_caller(out, + labels, + n_rows, + n_cols, + n_clusters, + stream, + row_major, + centers, + cluster_std, + cluster_std_scalar, + shuffle, + center_box_min, + center_box_max, + seed, + type); +} + +} // end namespace raft::random + +#endif \ No newline at end of file diff --git a/cpp/include/raft/random/make_blobs.hpp b/cpp/include/raft/random/make_blobs.hpp index afdabfe55b..19d4b8499b 100644 --- a/cpp/include/raft/random/make_blobs.hpp +++ b/cpp/include/raft/random/make_blobs.hpp @@ -14,6 +14,14 @@ * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __MAKE_BLOBS_H +#define __MAKE_BLOBS_H + #pragma once #include "detail/make_blobs.cuh" @@ -88,4 +96,6 @@ void make_blobs(DataT* out, type); } -} // end namespace raft::random \ No newline at end of file +} // end namespace raft::random + +#endif diff --git a/cpp/include/raft/random/make_regression.cuh b/cpp/include/raft/random/make_regression.cuh new file mode 100644 index 0000000000..4fbb48fa35 --- /dev/null +++ b/cpp/include/raft/random/make_regression.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
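A minimal `make_blobs` call (hypothetical device buffers; `d_labels` is an int array here because the index type is deduced from the integer literals, and all remaining parameters keep their defaults):

  // 1000 samples, 16 features, 5 clusters (illustrative only).
  raft::random::make_blobs(d_data, d_labels, 1000, 16, 5, stream);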
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Adapted from scikit-learn + * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/_samples_generator.py + */ + +#ifndef __MAKE_REGRESSION_H +#define __MAKE_REGRESSION_H + +#pragma once + +#include + +#include "detail/make_regression.cuh" + +namespace raft::random { + +/** + * @brief GPU-equivalent of sklearn.datasets.make_regression as documented at: + * https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html + * + * @tparam DataT Scalar type + * @tparam IdxT Index type + * + * @param[in] handle RAFT handle + * @param[out] out Row-major (samples, features) matrix to store + * the problem data + * @param[out] values Row-major (samples, targets) matrix to store + * the values for the regression problem + * @param[in] n_rows Number of samples + * @param[in] n_cols Number of features + * @param[in] n_informative Number of informative features (non-zero + * coefficients) + * @param[in] stream CUDA stream + * @param[out] coef Row-major (features, targets) matrix to store + * the coefficients used to generate the values + * for the regression problem. If nullptr is + * given, nothing will be written + * @param[in] n_targets Number of targets (generated values per sample) + * @param[in] bias A scalar that will be added to the values + * @param[in] effective_rank The approximate rank of the data matrix (used + * to create correlations in the data). -1 is the + * code to use well-conditioned data + * @param[in] tail_strength The relative importance of the fat noisy tail + * of the singular values profile if + * effective_rank is not -1 + * @param[in] noise Standard deviation of the gaussian noise + * applied to the output + * @param[in] shuffle Shuffle the samples and the features + * @param[in] seed Seed for the random number generator + * @param[in] type Random generator type + */ +template +void make_regression(const raft::handle_t& handle, + DataT* out, + DataT* values, + IdxT n_rows, + IdxT n_cols, + IdxT n_informative, + cudaStream_t stream, + DataT* coef = nullptr, + IdxT n_targets = (IdxT)1, + DataT bias = (DataT)0.0, + IdxT effective_rank = (IdxT)-1, + DataT tail_strength = (DataT)0.5, + DataT noise = (DataT)0.0, + bool shuffle = true, + uint64_t seed = 0ULL, + GeneratorType type = GenPhilox) +{ + detail::make_regression_caller(handle, + out, + values, + n_rows, + n_cols, + n_informative, + stream, + coef, + n_targets, + bias, + effective_rank, + tail_strength, + noise, + shuffle, + seed, + type); +} + +} // namespace raft::random + +#endif \ No newline at end of file diff --git a/cpp/include/raft/random/make_regression.hpp b/cpp/include/raft/random/make_regression.hpp index d6fceff466..c050a447ed 100644 --- a/cpp/include/raft/random/make_regression.hpp +++ b/cpp/include/raft/random/make_regression.hpp @@ -18,6 +18,14 @@ * https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/_samples_generator.py */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
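A minimal `make_regression` call (hypothetical device buffers; the coefficient output is skipped by leaving `coef` at its nullptr default):

  // 100 samples, 10 features, 5 of them informative (illustrative only).
  raft::random::make_regression(handle, d_data, d_values, 100, 10, 5, stream);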
+ */ + +#ifndef __MAKE_REGRESSION_H +#define __MAKE_REGRESSION_H + #pragma once #include @@ -97,4 +105,6 @@ void make_regression(const raft::handle_t& handle, type); } -} // namespace raft::random \ No newline at end of file +} // namespace raft::random + +#endif \ No newline at end of file diff --git a/cpp/include/raft/random/multi_variable_gaussian.cuh b/cpp/include/raft/random/multi_variable_gaussian.cuh new file mode 100644 index 0000000000..1d9d63f6c5 --- /dev/null +++ b/cpp/include/raft/random/multi_variable_gaussian.cuh @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __MVG_H +#define __MVG_H + +#pragma once + +#include "detail/multi_variable_gaussian.cuh" + +namespace raft::random { + +template +class multi_variable_gaussian : public detail::multi_variable_gaussian_impl { + public: + // using Decomposer = typename detail::multi_variable_gaussian_impl::Decomposer; + // using detail::multi_variable_gaussian_impl::Decomposer::chol_decomp; + // using detail::multi_variable_gaussian_impl::Decomposer::jacobi; + // using detail::multi_variable_gaussian_impl::Decomposer::qr; + + multi_variable_gaussian() = delete; + multi_variable_gaussian(const raft::handle_t& handle, + const int dim, + typename detail::multi_variable_gaussian_impl::Decomposer method) + : detail::multi_variable_gaussian_impl{handle, dim, method} + { + } + + std::size_t get_workspace_size() + { + return detail::multi_variable_gaussian_impl::get_workspace_size(); + } + + void set_workspace(T* workarea) + { + detail::multi_variable_gaussian_impl::set_workspace(workarea); + } + + void give_gaussian(const int nPoints, T* P, T* X, const T* x = 0) + { + detail::multi_variable_gaussian_impl::give_gaussian(nPoints, P, X, x); + } + + void deinit() { detail::multi_variable_gaussian_impl::deinit(); } + + ~multi_variable_gaussian() { deinit(); } +}; // end of multi_variable_gaussian + +}; // end of namespace raft::random + +#endif \ No newline at end of file diff --git a/cpp/include/raft/random/multi_variable_gaussian.hpp b/cpp/include/raft/random/multi_variable_gaussian.hpp index c2af52322a..fd1de4aadd 100644 --- a/cpp/include/raft/random/multi_variable_gaussian.hpp +++ b/cpp/include/raft/random/multi_variable_gaussian.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
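A hedged flow for the wrapper class above. This assumes the `Decomposer` enumerators are reachable through the public base class and that `get_workspace_size()` is a count of `T` elements; neither is guaranteed by this patch:

  raft::random::multi_variable_gaussian<float> mvg(
    handle, dim, raft::random::multi_variable_gaussian<float>::chol_decomp);
  rmm::device_uvector<float> ws(mvg.get_workspace_size(), stream);  // element count assumed
  mvg.set_workspace(ws.data());
  mvg.give_gaussian(n_points, d_cov, d_samples);  // d_cov: covariance, d_samples: output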
+ */
+
+#ifndef __MVG_H
+#define __MVG_H
 
 #pragma once
 
@@ -56,4 +63,6 @@ class multi_variable_gaussian : public detail::multi_variable_gaussian_impl {
   ~multi_variable_gaussian() { deinit(); }
 }; // end of multi_variable_gaussian
 
-}; // end of namespace raft::random
\ No newline at end of file
+}; // end of namespace raft::random
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/random/permute.cuh b/cpp/include/raft/random/permute.cuh
new file mode 100644
index 0000000000..1c01d589f4
--- /dev/null
+++ b/cpp/include/raft/random/permute.cuh
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PERMUTE_H
+#define __PERMUTE_H
+
+#pragma once
+
+#include "detail/permute.cuh"
+
+namespace raft::random {
+
+/**
+ * @brief Generate permutations of the input array. A useful primitive for
+ * shuffling input datasets in ML algorithms; see the note at the end for its
+ * limitations.
+ * @tparam Type Data type of the array to be shuffled
+ * @tparam IntType Integer type used for the perms array
+ * @tparam IdxType Integer type used for addressing indices
+ * @tparam TPB threads per block
+ * @param perms the output permutation indices. Typically useful only when
+ * one wants to refer back. If you don't need this, pass a nullptr
+ * @param out the output shuffled array. Pass nullptr if you don't want this to
+ * be written, e.g. when you only want the perms array to be filled.
+ * @param in input array (in-place is not supported due to race conditions!)
+ * @param D number of columns of the input array
+ * @param N length of the input array (or number of rows)
+ * @param rowMajor whether the input/output matrices are row or col major
+ * @param stream cuda stream where to launch the work
+ *
+ * @note This is NOT a uniform permutation generator! In fact, it only generates
+ * a very small percentage of all permutations. If your application really requires
+ * a high-quality permutation generator, consider the Knuth (Fisher-Yates) shuffle
+ * instead.
+ */
+template <typename Type, typename IntType = int, typename IdxType = long, int TPB = 256>
+void permute(IntType* perms,
+             Type* out,
+             const Type* in,
+             IntType D,
+             IntType N,
+             bool rowMajor,
+             cudaStream_t stream)
+{
+  detail::permute<Type, IntType, IdxType, TPB>(perms, out, in, D, N, rowMajor, stream);
+}
+
+}; // end namespace raft::random
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/random/permute.hpp b/cpp/include/raft/random/permute.hpp
index 32ed3779e4..3507d66cc3 100644
--- a/cpp/include/raft/random/permute.hpp
+++ b/cpp/include/raft/random/permute.hpp
@@ -13,6 +13,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
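A call sketch for `permute` (hypothetical device pointers for an N x D row-major input):

  // Shuffle rows, keeping the permutation indices for later lookup (illustrative only).
  raft::random::permute(d_perms, d_out, d_in, D, N, /*rowMajor=*/true, stream);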
+ */ + +#ifndef __PERMUTE_H +#define __PERMUTE_H #pragma once @@ -55,4 +62,6 @@ void permute(IntType* perms, detail::permute(perms, out, in, D, N, rowMajor, stream); } -}; // end namespace raft::random \ No newline at end of file +}; // end namespace raft::random + +#endif \ No newline at end of file diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh new file mode 100644 index 0000000000..3e75b2ae74 --- /dev/null +++ b/cpp/include/raft/random/rng.cuh @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __RNG_H +#define __RNG_H + +#pragma once + +#include "detail/rng_impl.cuh" + +namespace raft { +namespace random { + +using detail::RngState; + +using detail::GeneratorType; +using detail::GenPC; +using detail::GenPhilox; + +using detail::PCGenerator; +using detail::PhiloxGenerator; + +using detail::BernoulliDistParams; +using detail::ExponentialDistParams; +using detail::GumbelDistParams; +using detail::InvariantDistParams; +using detail::LaplaceDistParams; +using detail::LogisticDistParams; +using detail::LogNormalDistParams; +using detail::NormalDistParams; +using detail::NormalIntDistParams; +using detail::NormalTableDistParams; +using detail::RayleighDistParams; +using detail::SamplingParams; +using detail::ScaledBernoulliDistParams; +using detail::UniformDistParams; +using detail::UniformIntDistParams; + +// Not strictly needed due to C++ ADL rules +using detail::custom_next; + +/** + * @brief Helper method to compute Box Muller transform + * + * @tparam Type data type + * + * @param[inout] val1 first value + * @param[inout] val2 second value + * @param[in] sigma1 standard deviation of output gaussian for first value + * @param[in] mu1 mean of output gaussian for first value + * @param[in] sigma2 standard deviation of output gaussian for second value + * @param[in] mu2 mean of output gaussian for second value + * @{ + */ +template +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2) +{ + detail::box_muller_transform(val1, val2, sigma1, mu1, sigma2, mu2); +} + +template +DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1) +{ + detail::box_muller_transform(val1, val2, sigma1, mu1); +} +/** @} */ + +class Rng : public detail::RngImpl { + public: + /** + * @brief ctor + * @param _s 64b seed used to initialize the RNG + * @param _t backend device RNG generator type + * @note Refer to the `Rng::seed` method for details about seeding the engine + */ + Rng(uint64_t _s, GeneratorType _t = GenPhilox) : detail::RngImpl(_s, _t) {} + + /** + * @brief Generates the 'a' and 'b' parameters for a modulo affine + * transformation equation: `(ax + b) % n` + * + * @tparam IdxT integer type + * + * @param[in] n the modulo range + * @param[out] a slope parameter + * @param[out] b intercept parameter + */ + template + void affine_transform_params(IdxT n, IdxT& a, IdxT& b) + { + detail::RngImpl::affine_transform_params(n, 
a, b); + } + + /** + * @brief Generate uniformly distributed numbers in the given range + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param start start of the range + * @param end end of the range + * @param stream stream where to launch the kernel + * @{ + */ + template + void uniform(OutType* ptr, LenType len, OutType start, OutType end, cudaStream_t stream) + { + detail::RngImpl::uniform(ptr, len, start, end, stream); + } + + template + void uniformInt(OutType* ptr, LenType len, OutType start, OutType end, cudaStream_t stream) + { + detail::RngImpl::uniformInt(ptr, len, start, end, stream); + } + /** @} */ + + /** + * @brief Generate normal distributed numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param mu mean of the distribution + * @param sigma std-dev of the distribution + * @param stream stream where to launch the kernel + * @{ + */ + template + void normal(OutType* ptr, LenType len, OutType mu, OutType sigma, cudaStream_t stream) + { + detail::RngImpl::normal(ptr, len, mu, sigma, stream); + } + + template + void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream) + { + detail::RngImpl::normalInt(ptr, len, mu, sigma, stream); + } + /** @} */ + + /** + * @brief Generate normal distributed table according to the given set of + * means and scalar standard deviations. + * + * Each row in this table conforms to a normally distributed n-dim vector + * whose mean is the input vector and standard deviation is the corresponding + * vector or scalar. Correlations among the dimensions itself is assumed to + * be absent. + * + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output table (dim = n_rows x n_cols) + * @param n_rows number of rows in the table + * @param n_cols number of columns in the table + * @param mu_vec mean vector (dim = n_cols x 1). + * @param sigma_vec std-dev vector of each component (dim = n_cols x 1). 
Pass + * a nullptr to use the same scalar 'sigma' across all components + * @param sigma scalar sigma to be used if 'sigma_vec' is nullptr + * @param stream stream where to launch the kernel + */ + template + void normalTable(OutType* ptr, + LenType n_rows, + LenType n_cols, + const OutType* mu_vec, + const OutType* sigma_vec, + OutType sigma, + cudaStream_t stream) + { + detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu_vec, sigma_vec, sigma, stream); + } + + /** + * @brief Fill an array with the given value + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param val value to be filled + * @param stream stream where to launch the kernel + */ + template + void fill(OutType* ptr, LenType len, OutType val, cudaStream_t stream) + { + detail::RngImpl::fill(ptr, len, val, stream); + } + + /** + * @brief Generate bernoulli distributed boolean array + * + * @tparam Type data type in which to compute the probabilities + * @tparam OutType output data type + * @tparam LenType data type used to represent length of the arrays + * + * @param[out] ptr the output array + * @param[in] len the number of elements in the output + * @param[in] prob coin-toss probability for heads + * @param[in] stream stream where to launch the kernel + */ + template + void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream) + { + detail::RngImpl::bernoulli(ptr, len, prob, stream); + } + + /** + * @brief Generate bernoulli distributed array and applies scale + * @tparam Type data type in which to compute the probabilities + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param prob coin-toss probability for heads + * @param scale scaling factor + * @param stream stream where to launch the kernel + */ + template + void scaled_bernoulli(OutType* ptr, LenType len, OutType prob, OutType scale, cudaStream_t stream) + { + detail::RngImpl::scaled_bernoulli(ptr, len, prob, scale, stream); + } + + /** + * @brief Generate Gumbel distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu mean value + * @param beta scale value + * @param stream stream where to launch the kernel + * @note https://en.wikipedia.org/wiki/Gumbel_distribution + */ + template + void gumbel(OutType* ptr, LenType len, OutType mu, OutType beta, cudaStream_t stream) + { + detail::RngImpl::gumbel(ptr, len, mu, beta, stream); + } + + /** + * @brief Generate lognormal distributed numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param mu mean of the distribution + * @param sigma std-dev of the distribution + * @param stream stream where to launch the kernel + */ + template + void lognormal(OutType* ptr, LenType len, OutType mu, OutType sigma, cudaStream_t stream) + { + detail::RngImpl::lognormal(ptr, len, mu, sigma, stream); + } + + /** + * @brief Generate logistic distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * 
@param len number of elements in the output array + * @param mu mean value + * @param scale scale value + * @param stream stream where to launch the kernel + */ + template + void logistic(OutType* ptr, LenType len, OutType mu, OutType scale, cudaStream_t stream) + { + detail::RngImpl::logistic(ptr, len, mu, scale, stream); + } + + /** + * @brief Generate exponentially distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param lambda the lambda + * @param stream stream where to launch the kernel + */ + template + void exponential(OutType* ptr, LenType len, OutType lambda, cudaStream_t stream) + { + detail::RngImpl::exponential(ptr, len, lambda, stream); + } + + /** + * @brief Generate rayleigh distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param sigma the sigma + * @param stream stream where to launch the kernel + */ + template + void rayleigh(OutType* ptr, LenType len, OutType sigma, cudaStream_t stream) + { + detail::RngImpl::rayleigh(ptr, len, sigma, stream); + } + + /** + * @brief Generate laplace distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu the mean + * @param scale the scale + * @param stream stream where to launch the kernel + */ + template + void laplace(OutType* ptr, LenType len, OutType mu, OutType scale, cudaStream_t stream) + { + detail::RngImpl::laplace(ptr, len, mu, scale, stream); + } + + void advance(uint64_t max_streams, uint64_t max_calls_per_subsequence) + { + detail::RngImpl::advance(max_streams, max_calls_per_subsequence); + } + + /** + * @brief Sample the input array without replacement, optionally based on the + * input weight vector for each element in the array + * + * Implementation here is based on the `one-pass sampling` algo described here: + * https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf + * + * @note In the sampled array the elements which are picked will always appear + * in the increasing order of their weights as computed using the exponential + * distribution. So, if you're particular about the order (for eg. array + * permutations), then this might not be the right choice! + * + * @tparam DataT data type + * @tparam WeightsT weights type + * @tparam IdxT index type + * @param handle + * @param out output sampled array (of length 'sampledLen') + * @param outIdx indices of the sampled array (of length 'sampledLen'). Pass + * a nullptr if this is not required. + * @param in input array to be sampled (of length 'len') + * @param wts weights array (of length 'len'). 
Pass a nullptr if uniform + * sampling is desired + * @param sampledLen output sampled array length + * @param len input array length + * @param stream cuda stream + */ + template + void sampleWithoutReplacement(const raft::handle_t& handle, + DataT* out, + IdxT* outIdx, + const DataT* in, + const WeightsT* wts, + IdxT sampledLen, + IdxT len, + cudaStream_t stream) + { + detail::RngImpl::sampleWithoutReplacement( + handle, out, outIdx, in, wts, sampledLen, len, stream); + } +}; + +}; // end namespace random +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp index 2b1bdbccf7..2d1af6a97e 100644 --- a/cpp/include/raft/random/rng.hpp +++ b/cpp/include/raft/random/rng.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __RNG_H +#define __RNG_H #pragma once @@ -373,3 +380,5 @@ class Rng : public detail::RngImpl { }; // end namespace random }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh new file mode 100644 index 0000000000..b5568ef7d9 --- /dev/null +++ b/cpp/include/raft/sparse/convert/coo.cuh @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __COO_H +#define __COO_H + +#pragma once + +#include + +namespace raft { +namespace sparse { +namespace convert { + +/** + * @brief Convert a CSR row_ind array to a COO rows array + * @param row_ind: Input CSR row_ind array + * @param m: size of row_ind array + * @param coo_rows: Output COO row array + * @param nnz: size of output COO row array + * @param stream: cuda stream to use + */ +template +void csr_to_coo( + const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream) +{ + detail::csr_to_coo(row_ind, m, coo_rows, nnz, stream); +} + +}; // end NAMESPACE convert +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/convert/coo.hpp b/cpp/include/raft/sparse/convert/coo.hpp index c647b99620..009a19a563 100644 --- a/cpp/include/raft/sparse/convert/coo.hpp +++ b/cpp/include/raft/sparse/convert/coo.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
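As a usage illustration of the csr_to_coo wrapper just added above, here is a short sketch; it is not part of the patch, and the buffer names and sizes are hypothetical.

#include <raft/handle.hpp>
#include <raft/sparse/convert/coo.cuh>
#include <rmm/device_uvector.hpp>

void csr_to_coo_sketch(const raft::handle_t& handle,
                       const int* row_ind,  // CSR row offsets, m entries
                       int m,
                       int nnz)
{
  cudaStream_t stream = handle.get_stream();
  rmm::device_uvector<int> coo_rows(nnz, stream);
  // Expands the compressed row offsets into one row id per nonzero.
  raft::sparse::convert::csr_to_coo(row_ind, m, coo_rows.data(), nnz, stream);
}
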
+ */ + +#ifndef __COO_H +#define __COO_H #pragma once @@ -39,4 +46,6 @@ void csr_to_coo( }; // end NAMESPACE convert }; // end NAMESPACE sparse -}; // end NAMESPACE raft \ No newline at end of file +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh new file mode 100644 index 0000000000..10bc22bcc1 --- /dev/null +++ b/cpp/include/raft/sparse/convert/csr.cuh @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CSR_H +#define __CSR_H + +#pragma once + +#include <raft/sparse/convert/detail/csr.cuh> +#include <raft/sparse/coo.hpp> + +namespace raft { +namespace sparse { +namespace convert { + +/** + * @brief Convert COO arrays (srcRows, srcCols, srcVals) into CSR arrays + * @param handle: raft handle + * @param srcRows: COO rows array + * @param srcCols: COO columns array + * @param srcVals: COO values array + * @param nnz: size of the COO arrays + * @param m: number of rows in the matrix + * @param dst_offsets: output CSR row offsets + * @param dstCols: output CSR column indices + * @param dstVals: output CSR values + */ +template <typename value_t> +void coo_to_csr(const raft::handle_t& handle, + const int* srcRows, + const int* srcCols, + const value_t* srcVals, + int nnz, + int m, + int* dst_offsets, + int* dstCols, + value_t* dstVals) +{ + detail::coo_to_csr(handle, srcRows, srcCols, srcVals, nnz, m, dst_offsets, dstCols, dstVals); +} + +/** + * @brief Constructs an adjacency graph CSR row_ind_ptr array from + * a row_ind array and adjacency array. + * @tparam Index_ the numeric type of the index arrays + * @tparam TPB_X the number of threads to use per block for kernels + * @tparam Lambda function for fused operation in the adj_graph construction + * @param row_ind the input CSR row_ind array + * @param total_rows number of vertices in graph + * @param nnz number of non-zeros + * @param batchSize number of vertices in current batch + * @param adj an adjacency array (size batchSize x total_rows) + * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph + * @param stream cuda stream to use + * @param fused_op: the fused operation + */ +template <typename Index_, int TPB_X = 32, typename Lambda = auto (Index_, Index_, Index_) -> void> +void csr_adj_graph_batched(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + Index_ batchSize, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream, + Lambda fused_op) +{ + detail::csr_adj_graph_batched( + row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream, fused_op); +} + +template <typename Index_, int TPB_X = 32, typename Lambda = auto (Index_, Index_, Index_) -> void> +void csr_adj_graph_batched(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + Index_ batchSize, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream) +{ + detail::csr_adj_graph_batched( + row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream); +} + +/** + * @brief Constructs an adjacency graph CSR row_ind_ptr array from + * a row_ind array and an adjacency array. 
+ * @tparam Index_ the numeric type of the index arrays + * @tparam TPB_X the number of threads to use per block for kernels + * @param row_ind the input CSR row_ind array + * @param total_rows number of total vertices in graph + * @param nnz number of non-zeros + * @param adj an adjacency array + * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph + * @param stream cuda stream to use + * @param fused_op the fused operation + */ +template <typename Index_, int TPB_X = 32, typename Lambda = auto (Index_, Index_, Index_) -> void> +void csr_adj_graph(const Index_* row_ind, + Index_ total_rows, + Index_ nnz, + const bool* adj, + Index_* row_ind_ptr, + cudaStream_t stream, + Lambda fused_op) +{ + detail::csr_adj_graph( + row_ind, total_rows, nnz, adj, row_ind_ptr, stream, fused_op); +} + +/** + * @brief Generate the row indices array for a sorted COO matrix + * + * @param rows: COO rows array + * @param nnz: size of COO rows array + * @param row_ind: output row indices array + * @param m: number of rows in dense matrix + * @param stream: cuda stream to use + */ +template <typename T> +void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t stream) +{ + detail::sorted_coo_to_csr(rows, nnz, row_ind, m, stream); +} + +/** + * @brief Generate the row indices array for a sorted COO matrix + * + * @param coo: Input COO matrix + * @param row_ind: output row indices array + * @param stream: cuda stream to use + */ +template <typename T> +void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream) +{ + detail::sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream); +} + +}; // end NAMESPACE convert +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/convert/csr.hpp b/cpp/include/raft/sparse/convert/csr.hpp index f0fe76bed3..6a9a99d014 100644 --- a/cpp/include/raft/sparse/convert/csr.hpp +++ b/cpp/include/raft/sparse/convert/csr.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __CSR_H +#define __CSR_H #pragma once @@ -135,4 +142,6 @@ void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream) }; // end NAMESPACE convert }; // end NAMESPACE sparse -}; // end NAMESPACE raft \ No newline at end of file +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh new file mode 100644 index 0000000000..a146113a86 --- /dev/null +++ b/cpp/include/raft/sparse/convert/dense.cuh @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
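A hedged sketch of the inverse conversion with sorted_coo_to_csr above; the function and argument names come from the header, while the surrounding setup is hypothetical.

#include <raft/handle.hpp>
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

void sorted_coo_to_csr_sketch(const raft::handle_t& handle,
                              const int* coo_rows,  // COO row ids, sorted by row
                              int nnz,
                              int m)  // number of rows in the dense view
{
  cudaStream_t stream = handle.get_stream();
  rmm::device_uvector<int> row_ind(m, stream);
  raft::sparse::convert::sorted_coo_to_csr(coo_rows, nnz, row_ind.data(), m, stream);
}
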
+ */ +#ifndef __DENSE_H +#define __DENSE_H + +#pragma once + +#include + +namespace raft { +namespace sparse { +namespace convert { + +/** + * Convert CSR arrays to a dense matrix in either row- + * or column-major format. A custom kernel is used when + * row-major output is desired since cusparse does not + * output row-major. + * @tparam value_idx : data type of the CSR index arrays + * @tparam value_t : data type of the CSR value array + * @param[in] handle : cusparse handle for conversion + * @param[in] nrows : number of rows in CSR + * @param[in] ncols : number of columns in CSR + * @param[in] nnz : number of nonzeros in CSR + * @param[in] csr_indptr : CSR row index pointer array + * @param[in] csr_indices : CSR column indices array + * @param[in] csr_data : CSR data array + * @param[in] lda : Leading dimension (used for col-major only) + * @param[out] out : Dense output array of size nrows * ncols + * @param[in] stream : Cuda stream for ordering events + * @param[in] row_major : Is row-major output desired? + */ +template +void csr_to_dense(cusparseHandle_t handle, + value_idx nrows, + value_idx ncols, + value_idx nnz, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data, + value_idx lda, + value_t* out, + cudaStream_t stream, + bool row_major = true) +{ + detail::csr_to_dense( + handle, nrows, ncols, nnz, csr_indptr, csr_indices, csr_data, lda, out, stream, row_major); +} + +}; // end NAMESPACE convert +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/convert/dense.hpp b/cpp/include/raft/sparse/convert/dense.hpp index 2570d7ae65..1bdfa26732 100644 --- a/cpp/include/raft/sparse/convert/dense.hpp +++ b/cpp/include/raft/sparse/convert/dense.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __DENSE_H +#define __DENSE_H #pragma once @@ -60,4 +67,6 @@ void csr_to_dense(cusparseHandle_t handle, }; // end NAMESPACE convert }; // end NAMESPACE sparse -}; // end NAMESPACE raft \ No newline at end of file +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/convert/detail/coo.cuh b/cpp/include/raft/sparse/convert/detail/coo.cuh index c37087789c..2d13bfa34e 100644 --- a/cpp/include/raft/sparse/convert/detail/coo.cuh +++ b/cpp/include/raft/sparse/convert/detail/coo.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/convert/detail/csr.cuh b/cpp/include/raft/sparse/convert/detail/csr.cuh index 751335dfca..2516d00533 100644 --- a/cpp/include/raft/sparse/convert/detail/csr.cuh +++ b/cpp/include/raft/sparse/convert/detail/csr.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
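A minimal sketch for csr_to_dense above. It assumes the caller obtains the raw cusparse handle from raft::handle_t; lda only matters for the column-major path.

#include <cstddef>
#include <raft/handle.hpp>
#include <raft/sparse/convert/dense.cuh>
#include <rmm/device_uvector.hpp>

void csr_to_dense_sketch(const raft::handle_t& handle,
                         int nrows, int ncols, int nnz,
                         const int* indptr, const int* indices, const float* data)
{
  cudaStream_t stream = handle.get_stream();
  rmm::device_uvector<float> dense(static_cast<std::size_t>(nrows) * ncols, stream);
  raft::sparse::convert::csr_to_dense(handle.get_cusparse_handle(),
                                      nrows, ncols, nnz,
                                      indptr, indices, data,
                                      nrows,  // lda (used for col-major only)
                                      dense.data(), stream,
                                      true);  // row-major output
}
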
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,8 +35,8 @@ #include #include -#include -#include +#include +#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/convert/detail/dense.cuh b/cpp/include/raft/sparse/convert/detail/dense.cuh index b2756b81c9..4f97cee8b4 100644 --- a/cpp/include/raft/sparse/convert/detail/dense.cuh +++ b/cpp/include/raft/sparse/convert/detail/dense.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/detail/csr.cuh b/cpp/include/raft/sparse/detail/csr.cuh index a256ac402b..1fd2bb9366 100644 --- a/cpp/include/raft/sparse/detail/csr.cuh +++ b/cpp/include/raft/sparse/detail/csr.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/detail/cusparse_macros.h b/cpp/include/raft/sparse/detail/cusparse_macros.h index 1f9f0e5175..10c7e8836c 100644 --- a/cpp/include/raft/sparse/detail/cusparse_macros.h +++ b/cpp/include/raft/sparse/detail/cusparse_macros.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/detail/cusparse_wrappers.h b/cpp/include/raft/sparse/detail/cusparse_wrappers.h index aef3976294..b9c4a61850 100644 --- a/cpp/include/raft/sparse/detail/cusparse_wrappers.h +++ b/cpp/include/raft/sparse/detail/cusparse_wrappers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh index 124fa2285d..7c1229b0d3 100644 --- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh index 020de9e014..9edd1305b3 100644 --- a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh +++ b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh index 6e717e9920..0848d24bde 100644 --- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,12 +22,12 @@ #include #include -#include +#include #include #include #include #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh index e6dd396f2d..468689848b 100644 --- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,12 +16,12 @@ #pragma once -#include +#include #include #include #include -#include +#include #include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh index 96d51f2e75..c6ff32caf3 100644 --- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh +++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/distance/detail/utils.cuh b/cpp/include/raft/sparse/distance/detail/utils.cuh index 06c034ad9f..a2fe090c96 100644 --- a/cpp/include/raft/sparse/distance/detail/utils.cuh +++ b/cpp/include/raft/sparse/distance/detail/utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh new file mode 100644 index 0000000000..ab189796ea --- /dev/null +++ b/cpp/include/raft/sparse/distance/distance.cuh @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __SPARSE_DIST_H +#define __SPARSE_DIST_H + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include + +namespace raft { +namespace sparse { +namespace distance { + +static const std::unordered_set supportedDistance{ + raft::distance::DistanceType::L2Expanded, + raft::distance::DistanceType::L2Unexpanded, + raft::distance::DistanceType::L2SqrtExpanded, + raft::distance::DistanceType::L2SqrtUnexpanded, + raft::distance::DistanceType::InnerProduct, + raft::distance::DistanceType::L1, + raft::distance::DistanceType::Canberra, + raft::distance::DistanceType::Linf, + raft::distance::DistanceType::LpUnexpanded, + raft::distance::DistanceType::JaccardExpanded, + raft::distance::DistanceType::CosineExpanded, + raft::distance::DistanceType::HellingerExpanded, + raft::distance::DistanceType::DiceExpanded, + raft::distance::DistanceType::CorrelationExpanded, + raft::distance::DistanceType::RusselRaoExpanded, + raft::distance::DistanceType::HammingUnexpanded, + raft::distance::DistanceType::JensenShannon, + raft::distance::DistanceType::KLDivergence}; + +/** + * Compute pairwise distances between A and B, using the provided + * input configuration and distance function. + * + * @tparam value_idx index type + * @tparam value_t value type + * @param[out] out dense output array (size A.nrows * B.nrows) + * @param[in] input_config input argument configuration + * @param[in] metric distance metric to use + * @param[in] metric_arg metric argument (used for Minkowski distance) + */ +template +void pairwiseDistance(value_t* out, + distances_config_t input_config, + raft::distance::DistanceType metric, + float metric_arg) +{ + switch (metric) { + case raft::distance::DistanceType::L2Expanded: + detail::l2_expanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::L2SqrtExpanded: + detail::l2_sqrt_expanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::InnerProduct: + detail::ip_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::L2Unexpanded: + detail::l2_unexpanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::L2SqrtUnexpanded: + detail::l2_sqrt_unexpanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::L1: + detail::l1_unexpanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::LpUnexpanded: + detail::lp_unexpanded_distances_t(input_config, metric_arg).compute(out); + break; + case raft::distance::DistanceType::Linf: + detail::linf_unexpanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::Canberra: + detail::canberra_unexpanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::JaccardExpanded: + detail::jaccard_expanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::CosineExpanded: + detail::cosine_expanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::HellingerExpanded: + detail::hellinger_expanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::DiceExpanded: + detail::dice_expanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::CorrelationExpanded: + detail::correlation_expanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::RusselRaoExpanded: + 
detail::russelrao_expanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::HammingUnexpanded: + detail::hamming_unexpanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::JensenShannon: + detail::jensen_shannon_unexpanded_distances_t(input_config).compute(out); + break; + case raft::distance::DistanceType::KLDivergence: + detail::kl_divergence_unexpanded_distances_t(input_config).compute(out); + break; + + default: THROW("Unsupported distance: %d", metric); + } +} + +}; // namespace distance +}; // namespace sparse +}; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp index dc9837ab43..cba419e53a 100644 --- a/cpp/include/raft/sparse/distance/distance.hpp +++ b/cpp/include/raft/sparse/distance/distance.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SPARSE_DIST_H +#define __SPARSE_DIST_H #pragma once @@ -130,3 +137,5 @@ void pairwiseDistance(value_t* out, }; // namespace distance }; // namespace sparse }; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh index 105f1cc9f6..c89f5a370a 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh index fe58246545..9d4126f8fd 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,14 +20,14 @@ #include #include -#include +#include #include #include -#include +#include #include #include -#include +#include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh index 10e9d04c0d..545a371850 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh +++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
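To make the dispatch above concrete, here is a sketch of computing expanded L2 distances between two CSR matrices. It assumes distances_config_t carries the usual a_*/b_* CSR descriptor fields (as in the existing hpp header this file mirrors) and that the caller has already populated them.

#include <raft/handle.hpp>
#include <raft/sparse/distance/distance.cuh>

void sparse_l2_sketch(raft::sparse::distance::distances_config_t<int, float>& config,
                      float* out)  // dense output of size a_nrows * b_nrows
{
  raft::sparse::distance::pairwiseDistance(
    out, config, raft::distance::DistanceType::L2Expanded, 0.0f);
}
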
@@ -20,8 +20,8 @@ #include #include -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/sparse/hierarchy/detail/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/detail/single_linkage.cuh similarity index 99% rename from cpp/include/raft/sparse/hierarchy/detail/single_linkage.hpp rename to cpp/include/raft/sparse/hierarchy/detail/single_linkage.cuh index 702198e422..4e94b6f65d 100644 --- a/cpp/include/raft/sparse/hierarchy/detail/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/detail/single_linkage.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.cuh b/cpp/include/raft/sparse/hierarchy/single_linkage.cuh new file mode 100644 index 0000000000..86940005b4 --- /dev/null +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.cuh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __SINGLE_LINKAGE_H +#define __SINGLE_LINKAGE_H + +#pragma once + +#include <raft/handle.hpp> +#include <raft/sparse/hierarchy/detail/single_linkage.cuh> + +namespace raft { +namespace hierarchy { + +/** + * Single-linkage clustering. To scale beyond the n^2 memory consumption of + * implementations that use the fully-connected graph of pairwise distances, + * a KNN graph can be constructed instead and then connected further when k + * is not large enough to produce a single connected component. + * + * @tparam value_idx index type + * @tparam value_t value type + * @tparam dist_type method to use for constructing connectivities graph + * @param[in] handle raft handle + * @param[in] X dense input matrix in row-major layout + * @param[in] m number of rows in X + * @param[in] n number of columns in X + * @param[in] metric distance metric to use when constructing connectivities graph + * @param[out] out struct containing output dendrogram and cluster assignments + * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect + control + * of k. The algorithm will set `k = log(n) + c` + * @param[in] n_clusters number of clusters to assign data samples + */ +template <typename value_idx, typename value_t, LinkageDistance dist_type> +void single_linkage(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + linkage_output<value_idx, value_t>* out, + int c, + size_t n_clusters) +{ + detail::single_linkage<value_idx, value_t, dist_type>( + handle, X, m, n, metric, out, c, n_clusters); +} +}; // namespace hierarchy +}; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp index 104c1235d4..e7a37b7bf5 100644 --- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp +++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
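A hedged sketch of calling single_linkage above; the explicit KNN_GRAPH connectivity type and the caller-allocated linkage_output buffers are assumptions based on the hpp header this file mirrors.

#include <raft/handle.hpp>
#include <raft/sparse/hierarchy/single_linkage.cuh>

void single_linkage_sketch(const raft::handle_t& handle,
                           const float* X,  // row-major, m x n
                           size_t m, size_t n,
                           raft::hierarchy::linkage_output<int, float>* out)
{
  raft::hierarchy::single_linkage<int, float, raft::hierarchy::LinkageDistance::KNN_GRAPH>(
    handle, X, m, n, raft::distance::DistanceType::L2SqrtExpanded, out,
    15,  // c: the algorithm sets k = log(n) + c
    5);  // n_clusters
}
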
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,11 +13,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SINGLE_LINKAGE_H +#define __SINGLE_LINKAGE_H #pragma once #include <raft/handle.hpp> -#include <raft/sparse/hierarchy/detail/single_linkage.hpp> +#include <raft/sparse/hierarchy/detail/single_linkage.cuh> namespace raft { namespace hierarchy { @@ -59,3 +66,5 @@ void single_linkage(const raft::handle_t& handle, } }; // namespace hierarchy }; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh new file mode 100644 index 0000000000..def305afb2 --- /dev/null +++ b/cpp/include/raft/sparse/linalg/add.cuh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __SPARSE_ADD_H +#define __SPARSE_ADD_H + +#pragma once + +#include <raft/sparse/linalg/detail/add.cuh> + +namespace raft { +namespace sparse { +namespace linalg { + +/** + * @brief Calculate the CSR row_ind array that would result + * from summing together two CSR matrices + * @param a_ind: left hand row_ind array + * @param a_indptr: left hand index_ptr array + * @param a_val: left hand data array + * @param nnz1: size of left hand index_ptr and val arrays + * @param b_ind: right hand row_ind array + * @param b_indptr: right hand index_ptr array + * @param b_val: right hand data array + * @param nnz2: size of right hand index_ptr and val arrays + * @param m: size of output array (number of rows in final matrix) + * @param out_ind: output row_ind array + * @param stream: cuda stream to use + * @return the number of non-zeros in the resulting (summed) matrix + */ +template <typename T> +size_t csr_add_calc_inds(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + int m, + int* out_ind, + cudaStream_t stream) +{ + return detail::csr_add_calc_inds( + a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, out_ind, stream); +} + +/** + * @brief Calculate the final CSR arrays (column indices and values) that result + * from summing together two CSR matrices, using the output row_ind array + * computed by csr_add_calc_inds + * @param a_ind: left hand row_ind array + * @param a_indptr: left hand index_ptr array + * @param a_val: left hand data array + * @param nnz1: size of left hand index_ptr and val arrays + * @param b_ind: right hand row_ind array + * @param b_indptr: right hand index_ptr array + * @param b_val: right hand data array + * @param nnz2: size of right hand index_ptr and val arrays + * @param m: size of output array (number of rows in final matrix) + * @param c_ind: output row_ind array + * @param c_indptr: output ind_ptr array + * @param c_val: output data array + * @param stream: cuda stream to use + */ +template <typename T> +void csr_add_finalize(const int* a_ind, + const int* a_indptr, + const T* a_val, + int nnz1, + const int* b_ind, + const int* b_indptr, + const T* b_val, + int nnz2, + 
int m, + int* c_ind, + int* c_indptr, + T* c_val, + cudaStream_t stream) +{ + detail::csr_add_finalize( + a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val, stream); +} + +}; // end NAMESPACE linalg +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/add.hpp b/cpp/include/raft/sparse/linalg/add.hpp index 30c39b1ffc..33259cb39f 100644 --- a/cpp/include/raft/sparse/linalg/add.hpp +++ b/cpp/include/raft/sparse/linalg/add.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SPARSE_ADD_H +#define __SPARSE_ADD_H #pragma once @@ -93,3 +100,5 @@ void csr_add_finalize(const int* a_ind, }; // end NAMESPACE linalg }; // end NAMESPACE sparse }; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh new file mode 100644 index 0000000000..57c9b986b4 --- /dev/null +++ b/cpp/include/raft/sparse/linalg/degree.cuh @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
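The two functions above form a two-phase API: csr_add_calc_inds sizes the output and fills its row offsets, returning the output nnz that the caller uses to allocate the buffers passed to csr_add_finalize. A hypothetical sketch (note the naming convention in this API: *_ind are row offsets and *_indptr are per-nonzero column indices):

#include <cstddef>
#include <raft/handle.hpp>
#include <raft/sparse/linalg/add.cuh>
#include <rmm/device_uvector.hpp>

void csr_add_sketch(const raft::handle_t& handle,
                    const int* a_ind, const int* a_indptr, const float* a_val, int nnz1,
                    const int* b_ind, const int* b_indptr, const float* b_val, int nnz2,
                    int m)
{
  cudaStream_t stream = handle.get_stream();
  rmm::device_uvector<int> c_ind(m, stream);  // output row offsets
  std::size_t c_nnz = raft::sparse::linalg::csr_add_calc_inds(
    a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind.data(), stream);

  rmm::device_uvector<int> c_indptr(c_nnz, stream);  // output column indices
  rmm::device_uvector<float> c_val(c_nnz, stream);   // output values
  raft::sparse::linalg::csr_add_finalize(
    a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m,
    c_ind.data(), c_indptr.data(), c_val.data(), stream);
}
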
+ */ +#ifndef __SPARSE_DEGREE_H +#define __SPARSE_DEGREE_H + +#pragma once + +#include <raft/sparse/coo.hpp> +#include <raft/sparse/linalg/detail/degree.cuh> + +namespace raft { +namespace sparse { +namespace linalg { + +/** + * @brief Count the number of values for each row + * @tparam T: the type name of the underlying index arrays + * @param rows: rows array of the COO matrix + * @param nnz: size of the rows array + * @param results: output result array + * @param stream: cuda stream to use + */ +template <typename T> +void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream) +{ + detail::coo_degree<64, T>(rows, nnz, results, stream); +} + +/** + * @brief Count the number of values for each row + * @tparam T: type name of underlying values array + * @param in: input COO object for counting rows + * @param results: output array with row counts (size=in->n_rows) + * @param stream: cuda stream to use + */ +template <typename T> +void coo_degree(COO<T>* in, int* results, cudaStream_t stream) +{ + coo_degree(in->rows(), in->nnz, results, stream); +} + +/** + * @brief Count the number of values for each row that don't match a particular scalar + * @tparam T: the type name of the underlying value arrays + * @param rows: Input COO row array + * @param vals: Input COO val arrays + * @param nnz: size of input COO arrays + * @param scalar: scalar to match for counting rows + * @param results: output row counts + * @param stream: cuda stream to use + */ +template <typename T> +void coo_degree_scalar( + const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0) +{ + detail::coo_degree_scalar<64>(rows, vals, nnz, scalar, results, stream); +} + +/** + * @brief Count the number of values for each row that don't match a particular scalar + * @tparam T: the type name of the underlying value arrays + * @param in: Input COO array + * @param scalar: scalar to match for counting rows + * @param results: output row counts + * @param stream: cuda stream to use + */ +template <typename T> +void coo_degree_scalar(COO<T>* in, T scalar, int* results, cudaStream_t stream) +{ + coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, results, stream); +} + +/** + * @brief Count the number of nonzeros for each row + * @tparam T: the type name of the underlying value arrays + * @param rows: Input COO row array + * @param vals: Input COO val arrays + * @param nnz: size of input COO arrays + * @param results: output row counts + * @param stream: cuda stream to use + */ +template <typename T> +void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream) +{ + detail::coo_degree_nz<64>(rows, vals, nnz, results, stream); +} + +/** + * @brief Count the number of nonzero values for each row + * @tparam T: the type name of the underlying value arrays + * @param in: Input COO array + * @param results: output row counts + * @param stream: cuda stream to use + */ +template <typename T> +void coo_degree_nz(COO<T>* in, int* results, cudaStream_t stream) +{ + coo_degree_nz(in->rows(), in->vals(), in->nnz, results, stream); +} + +}; // end NAMESPACE linalg +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/degree.hpp b/cpp/include/raft/sparse/linalg/degree.hpp index 04643b219d..0c6af596ce 100644 --- a/cpp/include/raft/sparse/linalg/degree.hpp +++ 
b/cpp/include/raft/sparse/linalg/degree.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SPARSE_DEGREE_H +#define __SPARSE_DEGREE_H #pragma once @@ -117,3 +124,5 @@ void coo_degree_nz(COO* in, int* results, cudaStream_t stream) }; // end NAMESPACE linalg }; // end NAMESPACE sparse }; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/detail/add.cuh b/cpp/include/raft/sparse/linalg/detail/add.cuh index b288d0a603..5c3d07fc02 100644 --- a/cpp/include/raft/sparse/linalg/detail/add.cuh +++ b/cpp/include/raft/sparse/linalg/detail/add.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh index b7420a55e7..ba0ecd5dcc 100644 --- a/cpp/include/raft/sparse/linalg/detail/norm.cuh +++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh index 7e5bd5b9e4..c295932719 100644 --- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh +++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh @@ -17,12 +17,12 @@ #include #include -#include -#include -#include +#include +#include +#include #include -#include +#include #include namespace raft { diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh index 4384f2ba55..9143aac84f 100644 --- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh +++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include @@ -35,10 +35,10 @@ #include #include -#include +#include #include #include -#include +#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/linalg/detail/transpose.h b/cpp/include/raft/sparse/linalg/detail/transpose.h index 398877eaab..4820b489d1 100644 --- a/cpp/include/raft/sparse/linalg/detail/transpose.h +++ b/cpp/include/raft/sparse/linalg/detail/transpose.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
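Before the next file, a sketch of the scalar-degree primitive from degree.cuh above; zero-initializing the counts first is an assumption based on the counts being accumulated into the output array.

#include <cuda_runtime.h>
#include <raft/handle.hpp>
#include <raft/sparse/linalg/degree.cuh>
#include <rmm/device_uvector.hpp>

void degree_sketch(const raft::handle_t& handle,
                   const int* rows, const float* vals, int nnz, int n_rows)
{
  cudaStream_t stream = handle.get_stream();
  rmm::device_uvector<int> counts(n_rows, stream);
  cudaMemsetAsync(counts.data(), 0, n_rows * sizeof(int), stream);
  // Per-row counts of entries whose value is not 0.0f.
  raft::sparse::linalg::coo_degree_scalar(rows, vals, nnz, 0.0f, counts.data(), stream);
}
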
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh new file mode 100644 index 0000000000..e13fd22843 --- /dev/null +++ b/cpp/include/raft/sparse/linalg/norm.cuh @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __SPARSE_NORM_H +#define __SPARSE_NORM_H + +#pragma once + +#include <raft/sparse/linalg/detail/norm.cuh> + +namespace raft { +namespace sparse { +namespace linalg { + +/** + * @brief Perform L1 normalization on the rows of a given CSR-formatted sparse matrix + * + * @param ia: row_ind array + * @param vals: data array + * @param nnz: size of data array + * @param m: size of row_ind array + * @param result: l1 normalized data array + * @param stream: cuda stream to use + */ +template <typename T> +void csr_row_normalize_l1(const int* ia, // csr row ex_scan (sorted by row) + const T* vals, + int nnz, // array of values and number of non-zeros + int m, // num rows in csr + T* result, // output array + cudaStream_t stream) +{ + detail::csr_row_normalize_l1(ia, vals, nnz, m, result, stream); +} + +/** + * @brief Perform L_inf normalization on a given CSR-formatted sparse matrix + * + * @param ia: row_ind array + * @param vals: data array + * @param nnz: size of data array + * @param m: size of row_ind array + * @param result: max normalized data array + * @param stream: cuda stream to use + */ +template <typename T> +void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) + const T* vals, + int nnz, // array of values and number of non-zeros + int m, // num total rows in csr + T* result, + cudaStream_t stream) +{ + detail::csr_row_normalize_max(ia, vals, nnz, m, result, stream); +} + +}; // end NAMESPACE linalg +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/norm.hpp b/cpp/include/raft/sparse/linalg/norm.hpp index 683daedf4f..196951bac7 100644 --- a/cpp/include/raft/sparse/linalg/norm.hpp +++ b/cpp/include/raft/sparse/linalg/norm.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. 
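A short sketch of L1 row normalization with the wrapper above; buffer names and sizes are illustrative.

#include <raft/handle.hpp>
#include <raft/sparse/linalg/norm.cuh>
#include <rmm/device_uvector.hpp>

void normalize_sketch(const raft::handle_t& handle,
                      const int* row_ind, const float* vals, int nnz, int m)
{
  cudaStream_t stream = handle.get_stream();
  rmm::device_uvector<float> result(nnz, stream);
  // Each row's absolute values end up summing to one.
  raft::sparse::linalg::csr_row_normalize_l1(row_ind, vals, nnz, m, result.data(), stream);
}
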
+ */ + +#ifndef __SPARSE_NORM_H +#define __SPARSE_NORM_H #pragma once @@ -66,4 +73,6 @@ void csr_row_normalize_max(const int* ia, // csr row ind array (sorted by row) }; // end NAMESPACE linalg }; // end NAMESPACE sparse -}; // end NAMESPACE raft \ No newline at end of file +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh new file mode 100644 index 0000000000..fe95d1414c --- /dev/null +++ b/cpp/include/raft/sparse/linalg/spectral.cuh @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __SPARSE_SPECTRAL_H +#define __SPARSE_SPECTRAL_H + +#include +#include + +namespace raft { +namespace sparse { +namespace spectral { + +template +void fit_embedding(const raft::handle_t& handle, + int* rows, + int* cols, + T* vals, + int nnz, + int n, + int n_components, + T* out, + unsigned long long seed = 1234567) +{ + detail::fit_embedding(handle, rows, cols, vals, nnz, n, n_components, out, seed); +} +}; // namespace spectral +}; // namespace sparse +}; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/spectral.hpp b/cpp/include/raft/sparse/linalg/spectral.hpp index 619987062f..9daa6e07b0 100644 --- a/cpp/include/raft/sparse/linalg/spectral.hpp +++ b/cpp/include/raft/sparse/linalg/spectral.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SPARSE_SPECTRAL_H +#define __SPARSE_SPECTRAL_H #include #include @@ -37,3 +44,5 @@ void fit_embedding(const raft::handle_t& handle, }; // namespace spectral }; // namespace sparse }; // namespace raft + +#endif diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh new file mode 100644 index 0000000000..d41540c0b3 --- /dev/null +++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
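A hedged sketch of the spectral fit_embedding wrapper above, embedding an n x n COO affinity matrix into two components via the default-seed overload; allocation details are assumptions.

#include <cstddef>
#include <raft/handle.hpp>
#include <raft/sparse/linalg/spectral.cuh>
#include <rmm/device_uvector.hpp>

void spectral_sketch(const raft::handle_t& handle,
                     int* rows, int* cols, float* vals, int nnz, int n)
{
  int n_components = 2;
  rmm::device_uvector<float> embedding(
    static_cast<std::size_t>(n) * n_components, handle.get_stream());
  raft::sparse::spectral::fit_embedding(
    handle, rows, cols, vals, nnz, n, n_components, embedding.data());
}
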
+ */ +#ifndef __SYMMETRIZE_H +#define __SYMMETRIZE_H + +#pragma once + +#include <raft/sparse/coo.hpp> +#include <raft/sparse/linalg/detail/symmetrize.cuh> + +namespace raft { +namespace sparse { +namespace linalg { + +/** + * @brief Takes a COO matrix which may not be symmetric and symmetrizes + * it, running a custom reduction function against each value + * and its transposed value. + * + * @param in: Input COO matrix + * @param out: Output symmetrized COO matrix + * @param reduction_op: a custom reduction function + * @param stream: cuda stream to use + */ +template <int TPB_X = 128, typename T, typename Lambda> +void coo_symmetrize(COO<T>* in, + COO<T>* out, + Lambda reduction_op, // two-argument reducer + cudaStream_t stream) +{ + detail::coo_symmetrize(in, out, reduction_op, stream); +} + +/** + * @brief Find how much space is needed in each row. + * We look through all datapoints and increment the count for each row. + * + * TODO: This isn't generalized. Remove in place of `symmetrize()` + * @param data: Input knn distances(n, k) + * @param indices: Input knn indices(n, k) + * @param n: Number of rows + * @param k: Number of n_neighbors + * @param row_sizes: Input empty row sum 1 array(n) + * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction + */ +template <typename value_idx, typename value_t> +__global__ static void symmetric_find_size(const value_t* __restrict__ data, + const value_idx* __restrict__ indices, + const value_idx n, + const int k, + value_idx* __restrict__ row_sizes, + value_idx* __restrict__ row_sizes2) +{ + detail::symmetric_find_size(data, indices, n, k, row_sizes, row_sizes2); +} + +/** + * @brief Reduce sum(row_sizes) + k + * Reduction for symmetric_find_size kernel. Allows algo to be faster. + * + * TODO: This isn't generalized. Remove in place of `symmetrize()` + * @param n: Number of rows + * @param k: Number of n_neighbors + * @param row_sizes: Input row sum 1 array(n) + * @param row_sizes2: Input row sum 2 array(n) for faster reduction + */ +template <typename value_idx> +__global__ static void reduce_find_size(const value_idx n, + const int k, + value_idx* __restrict__ row_sizes, + const value_idx* __restrict__ row_sizes2) +{ + detail::reduce_find_size(n, k, row_sizes, row_sizes2); +} + +/** + * @brief Perform data + data.T operation. + * Can only be run once row_sizes from the CSR matrix of data + data.T has been + * determined. + * + * TODO: This isn't generalized. Remove in place of `symmetrize()` + * + * @param edges: Input row sum array(n) after reduction + * @param data: Input knn distances(n, k) + * @param indices: Input knn indices(n, k) + * @param VAL: Output values for data + data.T + * @param COL: Output column indices for data + data.T + * @param ROW: Output row indices for data + data.T + * @param n: Number of rows + * @param k: Number of n_neighbors + */ +template <typename value_idx, typename value_t> +__global__ static void symmetric_sum(value_idx* __restrict__ edges, + const value_t* __restrict__ data, + const value_idx* __restrict__ indices, + value_t* __restrict__ VAL, + value_idx* __restrict__ COL, + value_idx* __restrict__ ROW, + const value_idx n, + const int k) +{ + detail::symmetric_sum(edges, data, indices, VAL, COL, ROW, n, k); +} + +/** + * @brief Perform data + data.T on raw KNN data. + * The following steps are invoked: + * (1) Find how much space needed in each row + * (2) Compute final space needed (n*k + sum(row_sizes)) == 2*n*k + * (3) Allocate new space + * (4) Prepare edges for each new row + * (5) Perform final data + data.T operation + * (6) Return summed up VAL, COL, ROW + * + * TODO: This isn't generalized. 
Remove in place of `symmetrize()` + * + * @param knn_indices: Input knn indices(n, k) + * @param knn_dists: Input knn distances(n, k) + * @param n: Number of rows + * @param k: Number of n_neighbors + * @param out: Output COO Matrix class + * @param stream: Input cuda stream + */ +template <typename value_idx, typename value_t> +void from_knn_symmetrize_matrix(const value_idx* __restrict__ knn_indices, + const value_t* __restrict__ knn_dists, + const value_idx n, + const int k, + COO<value_t, value_idx>* out, + cudaStream_t stream) +{ + detail::from_knn_symmetrize_matrix(knn_indices, knn_dists, n, k, out, stream); +} + +/** + * Symmetrizes a COO matrix + */ +template <typename value_idx, typename value_t> +void symmetrize(const raft::handle_t& handle, + const value_idx* rows, + const value_idx* cols, + const value_t* vals, + size_t m, + size_t n, + size_t nnz, + raft::sparse::COO<value_t, value_idx>& out) +{ + detail::symmetrize(handle, rows, cols, vals, m, n, nnz, out); +} + +}; // end NAMESPACE linalg +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/symmetrize.hpp b/cpp/include/raft/sparse/linalg/symmetrize.hpp index 64d27f5b6f..4d8520dabf 100644 --- a/cpp/include/raft/sparse/linalg/symmetrize.hpp +++ b/cpp/include/raft/sparse/linalg/symmetrize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SYMMETRIZE_H +#define __SYMMETRIZE_H #pragma once @@ -162,3 +169,5 @@ void symmetrize(const raft::handle_t& handle, }; // end NAMESPACE linalg }; // end NAMESPACE sparse }; // end NAMESPACE raft + +#endif diff --git a/cpp/include/raft/sparse/linalg/transpose.cuh b/cpp/include/raft/sparse/linalg/transpose.cuh new file mode 100644 index 0000000000..8f0105f512 --- /dev/null +++ b/cpp/include/raft/sparse/linalg/transpose.cuh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __TRANSPOSE_H +#define __TRANSPOSE_H + +#pragma once + +#include <raft/handle.hpp> +#include <raft/sparse/linalg/detail/transpose.h> + +namespace raft { +namespace sparse { +namespace linalg { + +/** + * Transpose a set of CSR arrays into a set of CSC arrays. 
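A sketch of the general symmetrize entry point defined above; constructing the output COO with just a stream is an assumption about raft::sparse::COO's constructors.

#include <cstddef>
#include <raft/handle.hpp>
#include <raft/sparse/coo.hpp>
#include <raft/sparse/linalg/symmetrize.cuh>

void symmetrize_sketch(const raft::handle_t& handle,
                       const int* rows, const int* cols, const float* vals,
                       std::size_t m, std::size_t n, std::size_t nnz)
{
  raft::sparse::COO<float, int> out(handle.get_stream());
  raft::sparse::linalg::symmetrize(handle, rows, cols, vals, m, n, nnz, out);
  // out.rows(), out.cols(), out.vals() now hold the symmetrized matrix.
}
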
+ * @tparam value_idx : data type of the CSR index arrays + * @tparam value_t : data type of the CSR data array + * @param[in] handle : used for invoking cusparse + * @param[in] csr_indptr : CSR row index array + * @param[in] csr_indices : CSR column indices array + * @param[in] csr_data : CSR data array + * @param[out] csc_indptr : CSC row index array + * @param[out] csc_indices : CSC column indices array + * @param[out] csc_data : CSC data array + * @param[in] csr_nrows : Number of rows in CSR + * @param[in] csr_ncols : Number of columns in CSR + * @param[in] nnz : Number of nonzeros of CSR + * @param[in] stream : Cuda stream for ordering events + */ +template +void csr_transpose(const raft::handle_t& handle, + const value_idx* csr_indptr, + const value_idx* csr_indices, + const value_t* csr_data, + value_idx* csc_indptr, + value_idx* csc_indices, + value_t* csc_data, + value_idx csr_nrows, + value_idx csr_ncols, + value_idx nnz, + cudaStream_t stream) +{ + detail::csr_transpose(handle.get_cusparse_handle(), + csr_indptr, + csr_indices, + csr_data, + csc_indptr, + csc_indices, + csc_data, + csr_nrows, + csr_ncols, + nnz, + stream); +} + +}; // end NAMESPACE linalg +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/linalg/transpose.hpp b/cpp/include/raft/sparse/linalg/transpose.hpp index 6e40b647e9..0aea254803 100644 --- a/cpp/include/raft/sparse/linalg/transpose.hpp +++ b/cpp/include/raft/sparse/linalg/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __TRANSPOSE_H +#define __TRANSPOSE_H #pragma once @@ -68,3 +75,5 @@ void csr_transpose(const raft::handle_t& handle, }; // end NAMESPACE linalg }; // end NAMESPACE sparse }; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh index b49003467b..70a6ff521f 100644 --- a/cpp/include/raft/sparse/mst/mst.cuh +++ b/cpp/include/raft/sparse/mst/mst.cuh @@ -1,6 +1,6 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#ifndef __MST_H +#define __MST_H #pragma once @@ -51,3 +53,5 @@ raft::Graph_COO mst(const raft::handle_t& handle, } // namespace mst } // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/mst/mst.hpp b/cpp/include/raft/sparse/mst/mst.hpp new file mode 100644 index 0000000000..ac4cf21b64 --- /dev/null +++ b/cpp/include/raft/sparse/mst/mst.hpp @@ -0,0 +1,63 @@ + +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __MST_H +#define __MST_H + +#pragma once + +#include "mst_solver.cuh" + +namespace raft { +namespace mst { + +template +raft::Graph_COO mst(const raft::handle_t& handle, + edge_t const* offsets, + vertex_t const* indices, + weight_t const* weights, + vertex_t const v, + edge_t const e, + vertex_t* color, + cudaStream_t stream, + bool symmetrize_output = true, + bool initialize_colors = true, + int iterations = 0) +{ + MST_solver mst_solver(handle, + offsets, + indices, + weights, + v, + e, + color, + stream, + symmetrize_output, + initialize_colors, + iterations); + return mst_solver.solve(); +} + +} // namespace mst +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh index 80a6584251..4e4e76946c 100644 --- a/cpp/include/raft/sparse/op/detail/filter.cuh +++ b/cpp/include/raft/sparse/op/detail/filter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ #include #include -#include +#include namespace raft { namespace sparse { diff --git a/cpp/include/raft/sparse/op/detail/reduce.cuh b/cpp/include/raft/sparse/op/detail/reduce.cuh index 988f478f2b..eb747cce1e 100644 --- a/cpp/include/raft/sparse/op/detail/reduce.cuh +++ b/cpp/include/raft/sparse/op/detail/reduce.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/op/detail/row_op.cuh b/cpp/include/raft/sparse/op/detail/row_op.cuh index 4754f753d4..63c8cafaa7 100644 --- a/cpp/include/raft/sparse/op/detail/row_op.cuh +++ b/cpp/include/raft/sparse/op/detail/row_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/sparse/op/detail/slice.h b/cpp/include/raft/sparse/op/detail/slice.cuh similarity index 97% rename from cpp/include/raft/sparse/op/detail/slice.h rename to cpp/include/raft/sparse/op/detail/slice.cuh index e3c0f09e14..6bf6688076 100644 --- a/cpp/include/raft/sparse/op/detail/slice.h +++ b/cpp/include/raft/sparse/op/detail/slice.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/cpp/include/raft/sparse/op/detail/sort.h b/cpp/include/raft/sparse/op/detail/sort.h
index 9fc7cac5e3..a8b8161716 100644
--- a/cpp/include/raft/sparse/op/detail/sort.h
+++ b/cpp/include/raft/sparse/op/detail/sort.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
new file mode 100644
index 0000000000..6c36538137
--- /dev/null
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __FILTER_H
+#define __FILTER_H
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace raft {
+namespace sparse {
+namespace op {
+
+/**
+ * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix.
+ *
+ * @param rows: input array of rows (size nnz)
+ * @param cols: input array of cols (size nnz)
+ * @param vals: input array of vals (size nnz)
+ * @param nnz: size of current rows/cols/vals arrays
+ * @param crows: compressed array of rows
+ * @param ccols: compressed array of cols
+ * @param cvals: compressed array of vals
+ * @param cnnz: array of non-zero counts per row
+ * @param cur_cnnz: array of current non-zero counts per row
+ * @param scalar: scalar to remove from arrays
+ * @param n: number of rows in dense matrix
+ * @param stream: cuda stream to use
+ */
+template
+void coo_remove_scalar(const int* rows,
+                       const int* cols,
+                       const T* vals,
+                       int nnz,
+                       int* crows,
+                       int* ccols,
+                       T* cvals,
+                       int* cnnz,
+                       int* cur_cnnz,
+                       T scalar,
+                       int n,
+                       cudaStream_t stream)
+{
+  detail::coo_remove_scalar<128, T>(
+    rows, cols, vals, nnz, crows, ccols, cvals, cnnz, cur_cnnz, scalar, n, stream);
+}
+
+/**
+ * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix.
+ *
+ * @param in: input COO matrix
+ * @param out: output COO matrix
+ * @param scalar: scalar to remove from arrays
+ * @param stream: cuda stream to use
+ */
+template
+void coo_remove_scalar(COO* in, COO* out, T scalar, cudaStream_t stream)
+{
+  detail::coo_remove_scalar<128, T>(in, out, scalar, stream);
+}
+
+/**
+ * @brief Removes zeros from a COO formatted sparse matrix.
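+ *
+ * This is a convenience wrapper around coo_remove_scalar() with a scalar of
+ * T(0.0), as the body below shows. A minimal call sketch (illustrative only;
+ * `in`, `out`, and `stream` are assumed to be set up by the caller):
+ * @code
+ *   raft::sparse::op::coo_remove_zeros(&in, &out, stream);
+ * @endcode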
+ *
+ * @param in: input COO matrix
+ * @param out: output COO matrix
+ * @param stream: cuda stream to use
+ */
+template
+void coo_remove_zeros(COO* in, COO* out, cudaStream_t stream)
+{
+  coo_remove_scalar(in, out, T(0.0), stream);
+}
+
+}; // namespace op
+}; // end NAMESPACE sparse
+}; // end NAMESPACE raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/op/filter.hpp b/cpp/include/raft/sparse/op/filter.hpp
index 0dff063e91..b67084f18a 100644
--- a/cpp/include/raft/sparse/op/filter.hpp
+++ b/cpp/include/raft/sparse/op/filter.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,6 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */
+
+#ifndef __FILTER_H
+#define __FILTER_H
 
 #pragma once
 
@@ -88,3 +95,5 @@ void coo_remove_zeros(COO* in, COO* out, cudaStream_t stream)
 }; // namespace op
 }; // end NAMESPACE sparse
 }; // end NAMESPACE raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh
new file mode 100644
index 0000000000..fd860d2dc1
--- /dev/null
+++ b/cpp/include/raft/sparse/op/reduce.cuh
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __SPARSE_REDUCE_H
+#define __SPARSE_REDUCE_H
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace raft {
+namespace sparse {
+namespace op {
+/**
+ * Computes a mask from a sorted COO matrix where 0's denote
+ * duplicate values and 1's denote new values. This mask can
+ * be useful for computing an exclusive scan to pre-build offsets
+ * for reducing duplicates, such as when symmetrizing
+ * or taking the min of each duplicated value.
+ *
+ * Note that this function always marks the first value as 0 so that
+ * a cumulative sum can be performed as a follow-on. However, even
+ * if the mask is used directly, any duplicate still receives a 1
+ * when it is first encountered; only this convention keeps the
+ * first element from being a 1 as well.
+ *
+ * @tparam value_idx
+ * @param[out] mask output mask, size nnz
+ * @param[in] rows COO rows array, size nnz
+ * @param[in] cols COO cols array, size nnz
+ * @param[in] nnz number of nonzeros in input arrays
+ * @param[in] stream cuda ops will be ordered with respect to this stream
+ */
+template
+void compute_duplicates_mask(
+  value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream)
+{
+  detail::compute_duplicates_mask(mask, rows, cols, nnz, stream);
+}
+
+/**
+ * Performs a COO reduce of duplicate columns per row, taking the max weight
+ * for duplicate columns in each row.
This function assumes the input COO
+ * has been sorted by both row and column but makes no assumption on
+ * the sorting of values.
+ * @tparam value_idx
+ * @tparam value_t
+ * @param[in] handle raft handle
+ * @param[out] out output COO, the nnz will be computed and allocate() will be called in this function.
+ * @param[in] rows COO rows array, size nnz
+ * @param[in] cols COO cols array, size nnz
+ * @param[in] vals COO vals array, size nnz
+ * @param[in] nnz number of nonzeros in COO input arrays
+ * @param[in] m number of rows in COO input matrix
+ * @param[in] n number of columns in COO input matrix
+ */
+template
+void max_duplicates(const raft::handle_t& handle,
+                    raft::sparse::COO& out,
+                    const value_idx* rows,
+                    const value_idx* cols,
+                    const value_t* vals,
+                    size_t nnz,
+                    size_t m,
+                    size_t n)
+{
+  detail::max_duplicates(handle, out, rows, cols, vals, nnz, m, n);
+}
+}; // END namespace op
+}; // END namespace sparse
+}; // END namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/op/reduce.hpp b/cpp/include/raft/sparse/op/reduce.hpp
index b181f1c46f..a7e771d157 100644
--- a/cpp/include/raft/sparse/op/reduce.hpp
+++ b/cpp/include/raft/sparse/op/reduce.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,6 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */
+
+#ifndef __SPARSE_REDUCE_H
+#define __SPARSE_REDUCE_H
 
 #pragma once
 
@@ -81,3 +88,5 @@ void max_duplicates(const raft::handle_t& handle,
 }; // END namespace op
 }; // END namespace sparse
 }; // END namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh
new file mode 100644
index 0000000000..b31d3f29b6
--- /dev/null
+++ b/cpp/include/raft/sparse/op/row_op.cuh
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __SPARSE_ROW_OP_H
+#define __SPARSE_ROW_OP_H
+#pragma once
+
+#include
+#include
+
+namespace raft {
+namespace sparse {
+namespace op {
+
+/**
+ * @brief Perform a custom row operation on a CSR matrix in batches.
+ * @tparam Index_ numerical type of the row_ind array
+ * @tparam TPB_X number of threads per block to use for underlying kernel
+ * @tparam Lambda type of custom operation function
+ * @param row_ind the CSR row_ind array to perform parallel operations over
+ * @param n_rows total number of vertices in graph
+ * @param nnz number of non-zeros
+ * @param op custom row operation functor accepting the row and beginning index.
+ * @param stream cuda stream to use + */ +template void> +void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream) +{ + detail::csr_row_op(row_ind, n_rows, nnz, op, stream); +} + +}; // namespace op +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/row_op.hpp b/cpp/include/raft/sparse/op/row_op.hpp index 5dc115cfce..b3eafafa66 100644 --- a/cpp/include/raft/sparse/op/row_op.hpp +++ b/cpp/include/raft/sparse/op/row_op.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SPARSE_ROW_OP_H +#define __SPARSE_ROW_OP_H #pragma once @@ -43,3 +50,5 @@ void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cud }; // namespace op }; // end NAMESPACE sparse }; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/slice.cuh b/cpp/include/raft/sparse/op/slice.cuh new file mode 100644 index 0000000000..cd7be1924b --- /dev/null +++ b/cpp/include/raft/sparse/op/slice.cuh @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __SLICE_H +#define __SLICE_H + +#pragma once + +#include +#include + +namespace raft { +namespace sparse { +namespace op { + +/** + * Slice consecutive rows from a CSR array and populate newly sliced indptr array + * @tparam value_idx + * @param[in] start_row : beginning row to slice + * @param[in] stop_row : ending row to slice + * @param[in] indptr : indptr of input CSR to slice + * @param[out] indptr_out : output sliced indptr to populate + * @param[in] start_offset : beginning column offset of input indptr + * @param[in] stop_offset : ending column offset of input indptr + * @param[in] stream : cuda stream for ordering events + */ +template +void csr_row_slice_indptr(value_idx start_row, + value_idx stop_row, + const value_idx* indptr, + value_idx* indptr_out, + value_idx* start_offset, + value_idx* stop_offset, + cudaStream_t stream) +{ + detail::csr_row_slice_indptr( + start_row, stop_row, indptr, indptr_out, start_offset, stop_offset, stream); +} + +/** + * Slice rows from a CSR, populate column and data arrays + * @tparam value_idx : data type of CSR index arrays + * @tparam value_t : data type of CSR data array + * @param[in] start_offset : beginning column offset to slice + * @param[in] stop_offset : ending column offset to slice + * @param[in] indices : column indices array from input CSR + * @param[in] data : data array from input CSR + * @param[out] indices_out : output column indices array + * @param[out] data_out : output data array + * @param[in] stream : cuda stream for ordering events + */ +template +void csr_row_slice_populate(value_idx start_offset, + value_idx stop_offset, + const value_idx* indices, + const value_t* data, + value_idx* indices_out, + value_t* data_out, + cudaStream_t stream) +{ + detail::csr_row_slice_populate( + start_offset, stop_offset, indices, data, indices_out, data_out, stream); +} + +}; // namespace op +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/slice.hpp b/cpp/include/raft/sparse/op/slice.hpp index 917233319c..b4e0622ced 100644 --- a/cpp/include/raft/sparse/op/slice.hpp +++ b/cpp/include/raft/sparse/op/slice.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,11 +13,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SLICE_H +#define __SLICE_H #pragma once #include -#include +#include namespace raft { namespace sparse { @@ -75,3 +82,5 @@ void csr_row_slice_populate(value_idx start_offset, }; // namespace op }; // end NAMESPACE sparse }; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/sort.cuh b/cpp/include/raft/sparse/op/sort.cuh new file mode 100644 index 0000000000..ae0e587c3b --- /dev/null +++ b/cpp/include/raft/sparse/op/sort.cuh @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __SPARSE_SORT_H +#define __SPARSE_SORT_H + +#pragma once + +#include +#include + +namespace raft { +namespace sparse { +namespace op { + +/** + * @brief Sorts the arrays that comprise the coo matrix + * by row and then by column. + * + * @param m number of rows in coo matrix + * @param n number of cols in coo matrix + * @param nnz number of non-zeros + * @param rows rows array from coo matrix + * @param cols cols array from coo matrix + * @param vals vals array from coo matrix + * @param stream: cuda stream to use + */ +template +void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t stream) +{ + detail::coo_sort(m, n, nnz, rows, cols, vals, stream); +} + +/** + * @brief Sort the underlying COO arrays by row + * @tparam T: the type name of the underlying value array + * @param in: COO to sort by row + * @param stream: the cuda stream to use + */ +template +void coo_sort(COO* const in, cudaStream_t stream) +{ + coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream); +} + +/** + * Sorts a COO by its weight + * @tparam value_idx + * @tparam value_t + * @param[inout] rows source edges + * @param[inout] cols dest edges + * @param[inout] data edge weights + * @param[in] nnz number of edges in edge list + * @param[in] stream cuda stream for which to order cuda operations + */ +template +void coo_sort_by_weight( + value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream) +{ + detail::coo_sort_by_weight(rows, cols, data, nnz, stream); +} +}; // namespace op +}; // end NAMESPACE sparse +}; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/op/sort.hpp b/cpp/include/raft/sparse/op/sort.hpp index eb5c716976..12a4a77ca9 100644 --- a/cpp/include/raft/sparse/op/sort.hpp +++ b/cpp/include/raft/sparse/op/sort.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __SPARSE_SORT_H +#define __SPARSE_SORT_H #pragma once @@ -72,3 +79,5 @@ void coo_sort_by_weight( }; // namespace op }; // end NAMESPACE sparse }; // end NAMESPACE raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh new file mode 100644 index 0000000000..28bb5aa74b --- /dev/null +++ b/cpp/include/raft/sparse/selection/connect_components.cuh @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __CONNECT_COMPONENTS_H
+#define __CONNECT_COMPONENTS_H
+
+#include
+#include
+#include
+
+namespace raft {
+namespace linkage {
+
+template
+using FixConnectivitiesRedOp = detail::FixConnectivitiesRedOp;
+
+/**
+ * Gets the number of unique components from array of
+ * colors or labels. This does not assume the components are
+ * drawn from a monotonically increasing set.
+ * @tparam value_idx
+ * @param[in] colors array of components
+ * @param[in] n_rows size of components array
+ * @param[in] stream cuda stream for which to order cuda operations
+ * @return total number of components
+ */
+template
+value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream)
+{
+  return detail::get_n_components(colors, n_rows, stream);
+}
+
+/**
+ * Connects the components of an otherwise unconnected knn graph
+ * by computing a 1-nn to neighboring components of each data point
+ * (i.e. component(nn) != component(self)) and reducing the results to
+ * include the set of smallest destination components for each source
+ * component. The result will not necessarily contain
+ * n_components^2 - n_components elements because many components
+ * will likely not be contained in the neighborhoods of 1-nns.
+ * @tparam value_idx
+ * @tparam value_t
+ * @param[in] handle raft handle
+ * @param[out] out output edge list containing nearest cross-component
+ *             edges.
+ * @param[in] X original (row-major) dense matrix for which knn graph should be constructed.
+ * @param[in] orig_colors array containing component number for each row of X
+ * @param[in] n_rows number of rows in X
+ * @param[in] n_cols number of cols in X
+ * @param[in] reduction_op reduction operation to use when resolving the
+ *            1-nn distances across components (e.g. a FixConnectivitiesRedOp)
+ * @param[in] metric distance metric to use when computing the 1-nns
+ */
+template
+void connect_components(
+  const raft::handle_t& handle,
+  raft::sparse::COO& out,
+  const value_t* X,
+  const value_idx* orig_colors,
+  size_t n_rows,
+  size_t n_cols,
+  red_op reduction_op,
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded)
+{
+  detail::connect_components(handle, out, X, orig_colors, n_rows, n_cols, reduction_op, metric);
+}
+
+}; // end namespace linkage
+}; // end namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/selection/connect_components.hpp b/cpp/include/raft/sparse/selection/connect_components.hpp
index 23d247b50e..83d8fce8ba 100644
--- a/cpp/include/raft/sparse/selection/connect_components.hpp
+++ b/cpp/include/raft/sparse/selection/connect_components.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,6 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
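+ *
+ * Migrating only requires swapping the extension in the include, e.g.
+ * @code
+ * #include <raft/sparse/selection/connect_components.cuh>
+ * @endcode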
+ */ + +#ifndef __CONNECT_COMPONENTS_H +#define __CONNECT_COMPONENTS_H #include #include @@ -76,3 +83,5 @@ void connect_components( }; // end namespace linkage }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/selection/detail/connect_components.cuh b/cpp/include/raft/sparse/selection/detail/connect_components.cuh index 2b9ca2d8b5..9cfa2bbd44 100644 --- a/cpp/include/raft/sparse/selection/detail/connect_components.cuh +++ b/cpp/include/raft/sparse/selection/detail/connect_components.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,13 @@ #include -#include -#include -#include -#include +#include +#include +#include +#include #include -#include -#include +#include +#include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn.cuh b/cpp/include/raft/sparse/selection/detail/knn.cuh index d263f2409f..b1dd6116e7 100644 --- a/cpp/include/raft/sparse/selection/detail/knn.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,15 +21,15 @@ #include #include #include -#include -#include +#include +#include #include #include #include -#include -#include -#include +#include +#include +#include #include diff --git a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh index b222dfd9bd..32b7fd3c63 100644 --- a/cpp/include/raft/sparse/selection/detail/knn_graph.cuh +++ b/cpp/include/raft/sparse/selection/detail/knn_graph.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,9 +20,9 @@ #include #include -#include +#include -#include +#include #include #include diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh new file mode 100644 index 0000000000..fd9ab4ac3d --- /dev/null +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#ifndef __SPARSE_KNN_H
+#define __SPARSE_KNN_H
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace raft {
+namespace sparse {
+namespace selection {
+
+/**
+ * Search the sparse kNN index for the k-nearest neighbors of a set of sparse query vectors
+ * using some distance implementation
+ * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
+ * @param[in] idxIndices csr column indices array of the index matrix (size idxNNZ)
+ * @param[in] idxData csr data array of the index matrix (size idxNNZ)
+ * @param[in] idxNNZ number of non-zeros for sparse index matrix
+ * @param[in] n_idx_rows number of data samples in index matrix
+ * @param[in] n_idx_cols number of features in index matrix
+ * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
+ * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
+ * @param[in] queryData csr data array of the query matrix (size queryNNZ)
+ * @param[in] queryNNZ number of non-zeros for sparse query matrix
+ * @param[in] n_query_rows number of data samples in query matrix
+ * @param[in] n_query_cols number of features in query matrix
+ * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
+ * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
+ * @param[in] k the number of neighbors to query
+ * @param[in] handle raft handle; operations are ordered with respect to handle.get_stream()
+ * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
+ * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
+ * @param[in] metric distance metric/measure to use
+ * @param[in] metricArg potential argument for metric (currently unused)
+ */
+template
+void brute_force_knn(const value_idx* idxIndptr,
+                     const value_idx* idxIndices,
+                     const value_t* idxData,
+                     size_t idxNNZ,
+                     int n_idx_rows,
+                     int n_idx_cols,
+                     const value_idx* queryIndptr,
+                     const value_idx* queryIndices,
+                     const value_t* queryData,
+                     size_t queryNNZ,
+                     int n_query_rows,
+                     int n_query_cols,
+                     value_idx* output_indices,
+                     value_t* output_dists,
+                     int k,
+                     const raft::handle_t& handle,
+                     size_t batch_size_index = 2 << 14,  // 32k
+                     size_t batch_size_query = 2 << 14,
+                     raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
+                     float metricArg = 0)
+{
+  detail::sparse_knn_t(idxIndptr,
+                       idxIndices,
+                       idxData,
+                       idxNNZ,
+                       n_idx_rows,
+                       n_idx_cols,
+                       queryIndptr,
+                       queryIndices,
+                       queryData,
+                       queryNNZ,
+                       n_query_rows,
+                       n_query_cols,
+                       output_indices,
+                       output_dists,
+                       k,
+                       handle,
+                       batch_size_index,
+                       batch_size_query,
+                       metric,
+                       metricArg)
+    .run();
+}
+
+}; // namespace selection
+}; // namespace sparse
+}; // namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/selection/knn.hpp b/cpp/include/raft/sparse/selection/knn.hpp
index 8b2747d104..4158bd40c2 100644
--- a/cpp/include/raft/sparse/selection/knn.hpp
+++ b/cpp/include/raft/sparse/selection/knn.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,6 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */ + +#ifndef __SPARSE_KNN_H +#define __SPARSE_KNN_H #pragma once @@ -96,3 +103,5 @@ void brute_force_knn(const value_idx* idxIndptr, }; // namespace selection }; // namespace sparse }; // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh new file mode 100644 index 0000000000..7d342db43b --- /dev/null +++ b/cpp/include/raft/sparse/selection/knn_graph.cuh @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __KNN_GRAPH_H +#define __KNN_GRAPH_H + +#pragma once + +#include +#include +#include + +#include + +namespace raft { +namespace sparse { +namespace selection { + +/** + * Constructs a (symmetrized) knn graph edge list from + * dense input vectors. + * + * Note: The resulting KNN graph is not guaranteed to be connected. + * + * @tparam value_idx + * @tparam value_t + * @param[in] handle raft handle + * @param[in] X dense matrix of input data samples and observations + * @param[in] m number of data samples (rows) in X + * @param[in] n number of observations (columns) in X + * @param[in] metric distance metric to use when constructing neighborhoods + * @param[out] out output edge list + * @param c + */ +template +void knn_graph(const handle_t& handle, + const value_t* X, + std::size_t m, + std::size_t n, + raft::distance::DistanceType metric, + raft::sparse::COO& out, + int c = 15) +{ + detail::knn_graph(handle, X, m, n, metric, out, c); +} + +}; // namespace selection +}; // namespace sparse +}; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/sparse/selection/knn_graph.hpp b/cpp/include/raft/sparse/selection/knn_graph.hpp index 825761d44d..eb035390ce 100644 --- a/cpp/include/raft/sparse/selection/knn_graph.hpp +++ b/cpp/include/raft/sparse/selection/knn_graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __KNN_GRAPH_H +#define __KNN_GRAPH_H #pragma once @@ -57,3 +64,5 @@ void knn_graph(const handle_t& handle, }; // namespace selection }; // namespace sparse }; // end namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/spatial/knn/ann.cuh b/cpp/include/raft/spatial/knn/ann.cuh new file mode 100644 index 0000000000..2ef2ae0fa4 --- /dev/null +++ b/cpp/include/raft/spatial/knn/ann.cuh @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ANN_H
+#define __ANN_H
+
+#pragma once
+
+#include "ann_common.h"
+#include "detail/ann_quantized_faiss.cuh"
+
+#include
+#include
+
+namespace raft {
+namespace spatial {
+namespace knn {
+
+/**
+ * @brief Flat C++ API function to build an approximate nearest neighbors index
+ * from an index array and a set of parameters.
+ *
+ * @param[in] handle RAFT handle
+ * @param[out] index index to be built
+ * @param[in] params parametrization of the index to be built
+ * @param[in] metric distance metric to use. Euclidean (L2) is used by default
+ * @param[in] metricArg metric argument
+ * @param[in] index_array the index array to build the index with
+ * @param[in] n number of rows in the index array
+ * @param[in] D the dimensionality of the index array
+ */
+template
+inline void approx_knn_build_index(raft::handle_t& handle,
+                                   raft::spatial::knn::knnIndex* index,
+                                   knnIndexParam* params,
+                                   raft::distance::DistanceType metric,
+                                   float metricArg,
+                                   float* index_array,
+                                   value_idx n,
+                                   value_idx D)
+{
+  detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D);
+}
+
+/**
+ * @brief Flat C++ API function to perform an approximate nearest neighbors
+ * search from a previously built index and a query array
+ *
+ * @param[in] handle RAFT handle
+ * @param[out] distances distances of the nearest neighbors toward
+ * their query point
+ * @param[out] indices indices of the nearest neighbors
+ * @param[in] index index to perform a search with
+ * @param[in] k the number of nearest neighbors to search for
+ * @param[in] query_array the query to perform a search with
+ * @param[in] n number of rows in the query array
+ */
+template
+inline void approx_knn_search(raft::handle_t& handle,
+                              float* distances,
+                              int64_t* indices,
+                              raft::spatial::knn::knnIndex* index,
+                              value_idx k,
+                              float* query_array,
+                              value_idx n)
+{
+  detail::approx_knn_search(handle, distances, indices, index, k, query_array, n);
+}
+
+} // namespace knn
+} // namespace spatial
+} // namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp
index 5f64a8d1b5..bb11a2b11b 100644
--- a/cpp/include/raft/spatial/knn/ann.hpp
+++ b/cpp/include/raft/spatial/knn/ann.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,6 +13,13 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+/**
+ * @warning This file is deprecated and will be removed in release 22.06.
+ * Please use the cuh version instead.
+ */
+
+#ifndef __ANN_H
+#define __ANN_H
 
 #pragma once
 
@@ -80,3 +87,5 @@ inline void approx_knn_search(raft::handle_t& handle,
 } // namespace knn
 } // namespace spatial
 } // namespace raft
+
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 339ca3687a..5cdd6b1141 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/spatial/knn/ball_cover.cuh b/cpp/include/raft/spatial/knn/ball_cover.cuh
new file mode 100644
index 0000000000..df797ecca2
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/ball_cover.cuh
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BALL_COVER_H
+#define __BALL_COVER_H
+
+#pragma once
+
+#include
+
+#include "ball_cover_common.h"
+#include "detail/ball_cover.cuh"
+#include "detail/ball_cover/common.cuh"
+#include
+#include
+
+namespace raft {
+namespace spatial {
+namespace knn {
+
+/**
+ * Builds and populates a previously unbuilt BallCoverIndex
+ */
+template
+void rbc_build_index(const raft::handle_t& handle,
+                     BallCoverIndex& index)
+{
+  ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions");
+  if (index.metric == raft::distance::DistanceType::Haversine) {
+    detail::rbc_build_index(handle, index, detail::HaversineFunc());
+  } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
+             index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) {
+    detail::rbc_build_index(handle, index, detail::EuclideanFunc());
+  } else {
+    RAFT_FAIL("Metric not supported");
+  }
+
+  index.set_index_trained();
+}
+
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * performs an all neighbors knn, which can reuse memory when
+ * the index and query are the same array. This function will
+ * build the index and assumes rbc_build_index() has not already
+ * been called.
+ * @tparam value_idx knn index type
+ * @tparam value_t knn distance type
+ * @tparam value_int type for integers, such as number of rows/cols
+ * @param handle raft handle for resource management
+ * @param index ball cover index which has not yet been built
+ * @param k number of nearest neighbors to find
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param perform_post_filtering if this is false, only the closest k landmarks
+ * are considered (which will return approximate
+ * results).
+ * @param weight a weight for overlap between the closest landmark and
+ * the radius of other landmarks when pruning distances.
+ * Setting this value below 1 can effectively turn off
+ * computing distances against many other balls, enabling
+ * approximate nearest neighbors. Recall can be adjusted
+ * based on how many relevant balls are ignored. Note that
+ * many datasets can still have great recall even by only
+ * looking in the closest landmark.
+ */
+template
+void rbc_all_knn_query(const raft::handle_t& handle,
+                       BallCoverIndex& index,
+                       value_int k,
+                       value_idx* inds,
+                       value_t* dists,
+                       bool perform_post_filtering = true,
+                       float weight = 1.0)
+{
+  ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions");
+  if (index.metric == raft::distance::DistanceType::Haversine) {
+    detail::rbc_all_knn_query(handle,
+                              index,
+                              k,
+                              inds,
+                              dists,
+                              detail::HaversineFunc(),
+                              perform_post_filtering,
+                              weight);
+  } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
+             index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) {
+    detail::rbc_all_knn_query(handle,
+                              index,
+                              k,
+                              inds,
+                              dists,
+                              detail::EuclideanFunc(),
+                              perform_post_filtering,
+                              weight);
+  } else {
+    RAFT_FAIL("Metric not supported");
+  }
+
+  index.set_index_trained();
+}
+
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * function does not build the index and assumes rbc_build_index() has
+ * already been called. Use this function when the index and
+ * query arrays are different, otherwise use rbc_all_knn_query().
+ * @tparam value_idx index type
+ * @tparam value_t distances type
+ * @tparam value_int integer type for size info
+ * @param handle raft handle for resource management
+ * @param index ball cover index which has already been built
+ * @param k number of nearest neighbors to find
+ * @param query the query data points
+ * @param perform_post_filtering if this is false, only the closest k landmarks
+ * are considered (which will return approximate
+ * results).
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param weight a weight for overlap between the closest landmark and
+ * the radius of other landmarks when pruning distances.
+ * Setting this value below 1 can effectively turn off
+ * computing distances against many other balls, enabling
+ * approximate nearest neighbors. Recall can be adjusted
+ * based on how many relevant balls are ignored. Note that
+ * many datasets can still have great recall even by only
+ * looking in the closest landmark.
+ * @param[in] n_query_pts number of query points + */ +template +void rbc_knn_query(const raft::handle_t& handle, + BallCoverIndex& index, + value_int k, + const value_t* query, + value_int n_query_pts, + value_idx* inds, + value_t* dists, + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions"); + if (index.metric == raft::distance::DistanceType::Haversine) { + detail::rbc_knn_query(handle, + index, + k, + query, + n_query_pts, + inds, + dists, + detail::HaversineFunc(), + perform_post_filtering, + weight); + } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded || + index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) { + detail::rbc_knn_query(handle, + index, + k, + query, + n_query_pts, + inds, + dists, + detail::EuclideanFunc(), + perform_post_filtering, + weight); + } else { + RAFT_FAIL("Metric not supported"); + } +} + +// TODO: implement functions for: +// 4. rbc_eps_neigh() - given a populated index, perform query against different query array +// 5. rbc_all_eps_neigh() - populate a BallCoverIndex and query against training data + +} // namespace knn +} // namespace spatial +} // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp index d44e87710b..26c2c1fb2e 100644 --- a/cpp/include/raft/spatial/knn/ball_cover.hpp +++ b/cpp/include/raft/spatial/knn/ball_cover.hpp @@ -13,6 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/** + * @warning This file is deprecated and will be removed in release 22.06. + * Please use the cuh version instead. + */ + +#ifndef __BALL_COVER_H +#define __BALL_COVER_H #pragma once @@ -185,3 +192,5 @@ void rbc_knn_query(const raft::handle_t& handle, } // namespace knn } // namespace spatial } // namespace raft + +#endif \ No newline at end of file diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h index e1a202107b..0567e124d9 100644 --- a/cpp/include/raft/spatial/knn/ball_cover_common.h +++ b/cpp/include/raft/spatial/knn/ball_cover_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh index 4d9bfd82ad..78631b431f 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include