LinAlg impl in detail #383

Merged 27 commits · Feb 8, 2022
Changes shown are from 12 of the 27 commits.

Commits:
478ddac
working through
divyegala Oct 21, 2021
d4b72ba
working through
divyegala Nov 5, 2021
b472870
linalg detail
divyegala Nov 17, 2021
3bd9645
merging branch 22.02
divyegala Nov 17, 2021
788ffa8
style fix
divyegala Nov 17, 2021
f7d43b5
correcting include
divyegala Nov 17, 2021
282cd48
merging branch-21.12
divyegala Nov 17, 2021
37596c9
Merge remote-tracking branch 'upstream/branch-22.02' into imp-21.12-l…
divyegala Nov 17, 2021
cd4e1f9
merging upstream
divyegala Dec 14, 2021
9c0d655
removing deleted file again
divyegala Dec 14, 2021
a071d09
correcting merges and passing tests
divyegala Dec 14, 2021
db817f6
changing h extensions to hpp
divyegala Dec 14, 2021
abec4d2
cublas/cusolver only in detail, wrap up rest of linalg
divyegala Dec 22, 2021
b424cf1
merging upstream
divyegala Dec 22, 2021
34b2439
correcting doxygen build
divyegala Dec 22, 2021
897e6f7
correcting wrong docs
divyegala Dec 22, 2021
3d4b5f1
review feedback
divyegala Jan 11, 2022
4163619
merging branch-22.02
divyegala Jan 25, 2022
8ff01a9
Merge remote-tracking branch 'upstream/branch-22.04' into imp-21.12-l…
divyegala Jan 25, 2022
b6471d6
review changes
divyegala Jan 26, 2022
5d8c176
more macro renames
divyegala Jan 27, 2022
14cddfc
adding explicit stream set back to cublas and cusolver wrappers
divyegala Feb 2, 2022
a2f670f
resolving errors
divyegala Feb 2, 2022
89bf3c1
adding set stream to cublas set pointer mode
divyegala Feb 4, 2022
3c5d303
Merge branch 'branch-22.04' into imp-linalg-public
cjnolet Feb 4, 2022
5759c80
Merge branch 'branch-22.04' into imp-21.12-linalg_detail
cjnolet Feb 7, 2022
f94beef
Fixing a bad merge
cjnolet Feb 7, 2022
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/correlation.cuh
@@ -17,7 +17,7 @@
 #pragma once
 #include <raft/cuda_utils.cuh>
 #include <raft/distance/detail/pairwise_distance_base.cuh>
-#include <raft/linalg/reduce.cuh>
+#include <raft/linalg/reduce.hpp>

 namespace raft {
 namespace distance {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/cosine.cuh
@@ -17,7 +17,7 @@
 #pragma once

 #include <raft/distance/detail/pairwise_distance_base.cuh>
-#include <raft/linalg/norm.cuh>
+#include <raft/linalg/norm.hpp>

 namespace raft {
 namespace distance {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/distance.cuh
@@ -17,7 +17,6 @@
 #pragma once

 #include <cuda_runtime_api.h>
-#include <raft/linalg/distance_type.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/distance/detail/canberra.cuh>
 #include <raft/distance/detail/chebyshev.cuh>
@@ -31,6 +30,7 @@
 #include <raft/distance/detail/l1.cuh>
 #include <raft/distance/detail/minkowski.cuh>
 #include <raft/distance/detail/russell_rao.cuh>
+#include <raft/linalg/distance_type.hpp>
 #include <rmm/device_uvector.hpp>

 namespace raft {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/euclidean.cuh
@@ -16,7 +16,7 @@

 #pragma once
 #include <raft/distance/detail/pairwise_distance_base.cuh>
-#include <raft/linalg/norm.cuh>
+#include <raft/linalg/norm.hpp>

 namespace raft {
 namespace distance {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/fused_l2_nn.cuh
@@ -21,7 +21,7 @@
 #include <limits>
 #include <raft/cuda_utils.cuh>
 #include <raft/distance/detail/pairwise_distance_base.cuh>
-#include <raft/linalg/contractions.cuh>
+#include <raft/linalg/contractions.hpp>

 namespace raft {
 namespace distance {
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/detail/hellinger.cuh
@@ -16,7 +16,7 @@

 #pragma once
 #include <raft/distance/detail/pairwise_distance_base.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>

 namespace raft {
 namespace distance {
4 changes: 2 additions & 2 deletions cpp/include/raft/distance/detail/pairwise_distance_base.cuh
@@ -16,8 +16,8 @@
 #pragma once
 #include <raft/cudart_utils.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/contractions.cuh>
-#include <raft/linalg/norm.cuh>
+#include <raft/linalg/contractions.hpp>
+#include <raft/linalg/norm.hpp>
 #include <raft/vectorized.cuh>

 #include <cstddef>
2 changes: 1 addition & 1 deletion cpp/include/raft/distance/distance.hpp
@@ -16,9 +16,9 @@

 #pragma once

-#include <raft/linalg/distance_type.h>
 #include <raft/distance/detail/distance.cuh>
 #include <raft/handle.hpp>
+#include <raft/linalg/distance_type.hpp>
 #include <rmm/device_uvector.hpp>

 namespace raft {
4 changes: 2 additions & 2 deletions cpp/include/raft/handle.hpp
@@ -32,10 +32,10 @@
 ///@todo: enable once we have migrated cuml-comms layer too
 //#include <common/cuml_comms_int.hpp>

-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
 #include <raft/sparse/cusparse_wrappers.h>
 #include <raft/comms/comms.hpp>
+#include <raft/linalg/cublas_wrappers.hpp>
+#include <raft/linalg/cusolver_wrappers.hpp>
 #include <rmm/cuda_stream_pool.hpp>
 #include <rmm/exec_policy.hpp>
 #include "cudart_utils.h"
2 changes: 1 addition & 1 deletion cpp/include/raft/label/classlabels.cuh
@@ -20,7 +20,7 @@

 #include <raft/cudart_utils.h>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>

2 changes: 1 addition & 1 deletion cpp/include/raft/label/merge_labels.cuh
@@ -20,8 +20,8 @@
 #include <limits>

 #include <raft/cudart_utils.h>
-#include <raft/linalg/init.h>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/init.hpp>

 namespace raft {
 namespace label {
cpp/include/raft/linalg/add.hpp
@@ -16,12 +16,17 @@

 #pragma once

-#include "binary_op.cuh"
-#include "unary_op.cuh"
+#include "detail/add.cuh"
+#include "detail/functional.cuh"
+
+#include "binary_op.hpp"
+#include "unary_op.hpp"

 namespace raft {
 namespace linalg {

+using detail::adds_scalar;
+
 /**
  * @brief Elementwise scalar add operation on the input buffer
  *
@@ -39,8 +44,7 @@ namespace linalg {
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
 {
-  auto op = [scalar] __device__(InT in) { return OutT(in + scalar); };
-  unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
+  unaryOp(out, in, len, adds_scalar<InT, OutT>(scalar), stream);
 }

 /**
@@ -59,18 +63,7 @@ void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
 template <typename InT, typename OutT = InT, typename IdxType = int>
 void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
 {
-  auto op = [] __device__(InT a, InT b) { return OutT(a + b); };
-  binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
-}
-
-template <class math_t, typename IdxType>
-__global__ void add_dev_scalar_kernel(math_t* outDev,
-                                      const math_t* inDev,
-                                      const math_t* singleScalarDev,
-                                      IdxType len)
-{
-  IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; }
+  binaryOp(out, in1, in2, len, thrust::plus<InT>(), stream);
 }

 /** Subtract single value pointed by singleScalarDev parameter in device memory from inDev[i] and
@@ -90,11 +83,7 @@ void addDevScalar(math_t* outDev,
                   IdxType len,
                   cudaStream_t stream)
 {
-  // TODO: block dimension has not been tuned
-  dim3 block(256);
-  dim3 grid(raft::ceildiv(len, (IdxType)block.x));
-  add_dev_scalar_kernel<math_t><<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  detail::addDevScalar(outDev, inDev, singleScalarDev, len, stream);
 }

 }; // end namespace linalg
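The public signatures above are unchanged; only the bodies now forward to detail:: (addScalar through the adds_scalar functor, add through thrust::plus). A minimal usage sketch of this header, where the function name, buffer sizes, and setup are illustrative assumptions rather than part of the diff:

#include <raft/linalg/add.hpp>
#include <rmm/device_uvector.hpp>
#include <cuda_runtime_api.h>

// Compile as a CUDA translation unit (.cu). All names below are illustrative.
void add_example(cudaStream_t stream)
{
  const int len = 1024;
  rmm::device_uvector<float> in1(len, stream), in2(len, stream), out(len, stream);
  // ... fill in1 and in2 on the device ...

  // out[i] = in1[i] + in2[i]; internally dispatched through thrust::plus.
  raft::linalg::add(out.data(), in1.data(), in2.data(), len, stream);

  // out[i] = in1[i] + 5.0f; internally dispatched through detail::adds_scalar.
  raft::linalg::addScalar(out.data(), in1.data(), 5.0f, len, stream);
}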
54 changes: 54 additions & 0 deletions cpp/include/raft/linalg/binary_op.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/binary_op.cuh"
+
+#include <raft/cuda_utils.cuh>
+
+namespace raft {
+namespace linalg {
+
+/**
+ * @brief perform element-wise binary operation on the input arrays
+ * @tparam InType input data-type
+ * @tparam Lambda the device-lambda performing the actual operation
+ * @tparam OutType output data-type
+ * @tparam IdxType Integer type used for addressing
+ * @tparam TPB threads-per-block in the final kernel launched
+ * @param out the output array
+ * @param in1 the first input array
+ * @param in2 the second input array
+ * @param len number of elements in the input array
+ * @param op the device-lambda
+ * @param stream cuda stream where to launch work
+ * @note Lambda must be a functor with the following signature:
+ *       `OutType func(const InType& val1, const InType& val2);`
+ */
+template <typename InType,
+          typename Lambda,
+          typename OutType = InType,
+          typename IdxType = int,
+          int TPB = 256>
+void binaryOp(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
+{
+  detail::binaryOp(out, in1, in2, len, op, stream);
+}
+
+};  // end namespace linalg
+};  // end namespace raft
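binaryOp is now a thin public wrapper over detail::binaryOp, so callers pass a device functor or lambda matching the documented signature. A short sketch, assuming nvcc with extended-lambda support; the function name and sizes are illustrative:

#include <raft/linalg/binary_op.hpp>
#include <rmm/device_uvector.hpp>
#include <cuda_runtime_api.h>

// Compile as a CUDA translation unit (.cu) with --extended-lambda.
void binary_op_example(cudaStream_t stream)
{
  const int len = 1024;
  rmm::device_uvector<float> in1(len, stream), in2(len, stream), out(len, stream);

  // Matches the documented contract: OutType func(const InType& val1, const InType& val2);
  auto op = [] __device__(const float& a, const float& b) { return a * b + 1.0f; };
  raft::linalg::binaryOp(out.data(), in1.data(), in2.data(), len, op, stream);
}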
cpp/include/raft/linalg/cholesky_r1_update.hpp
@@ -16,11 +16,7 @@

 #pragma once

-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/handle.hpp>
-#include <raft/linalg/binary_op.cuh>
+#include "detail/cholesky_r1_update.hpp"

 namespace raft {
 namespace linalg {
@@ -132,94 +128,7 @@ void choleskyRank1Update(const raft::handle_t& handle,
                          cudaStream_t stream,
                          math_t eps = -1)
 {
-  // The matrix A' is defined as:
-  // A' = [[A_11, A_12]
-  //       [A_21, A_22]]
-  // where:
-  // - A_11 = A, matrix of size (n-1)x(n-1)
-  // - A_21[j] = A_12.T[j] = A_new[j] j=0..n-2, vector with (n-1) elements
-  // - A_22 = A_new[n-1] scalar.
-  //
-  // Instead of calculating the Cholesky decomposition of A' from scratch,
-  // we just update L with the new row. The new Cholesky decomposition will be
-  // calculated as:
-  // L' = [[L_11, 0]
-  //       [L_12, L_22]]
-  // where L_11 is the Cholesky decomposition of A (size [n-1 x n-1]), and
-  // L_12 and L_22 are the new quantities that we need to calculate.
-
-  // We need a workspace in device memory to store a scalar. Additionally, in
-  // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats.
-  const int align = 256;
-  int offset =
-    (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align) : 0;
-  if (workspace == nullptr) {
-    *n_bytes = offset + 1 * sizeof(math_t);
-    return;
-  }
-  math_t* s    = reinterpret_cast<math_t*>(((char*)workspace) + offset);
-  math_t* L_22 = L + (n - 1) * ld + n - 1;
-
-  math_t* A_new;
-  math_t* A_row;
-  if (uplo == CUBLAS_FILL_MODE_UPPER) {
-    // A_new is stored as the n-1 th column of L
-    A_new = L + (n - 1) * ld;
-  } else {
-    // If the input is lower triangular, then the new elements of A are stored
-    // as the n-th row of L. Since the matrix is column major, this is non
-    // contiguous. We copy elements from A_row to a contiguous workspace A_new.
-    A_row = L + n - 1;
-    A_new = reinterpret_cast<math_t*>(workspace);
-    RAFT_CUBLAS_TRY(
-      raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream));
-  }
-  cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
-  if (n > 1) {
-    // Calculate L_12 = x by solving equation L_11 x = A_12
-    math_t alpha = 1;
-    RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(handle.get_cublas_handle(),
-                                             CUBLAS_SIDE_LEFT,
-                                             uplo,
-                                             op,
-                                             CUBLAS_DIAG_NON_UNIT,
-                                             n - 1,
-                                             1,
-                                             &alpha,
-                                             L,
-                                             ld,
-                                             A_new,
-                                             n - 1,
-                                             stream));
-
-    // A_new now stores L_12, we calculate s = L_12 * L_12
-    RAFT_CUBLAS_TRY(
-      raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream));
-
-    if (uplo == CUBLAS_FILL_MODE_LOWER) {
-      // Copy back the L_12 elements as the n-th row of L
-      RAFT_CUBLAS_TRY(
-        raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream));
-    }
-  } else {  // n == 1 case
-    RAFT_CUDA_TRY(cudaMemsetAsync(s, 0, sizeof(math_t), stream));
-  }
-
-  // L_22 = sqrt(A_22 - L_12 * L_12)
-  math_t s_host;
-  math_t L_22_host;
-  raft::update_host(&s_host, s, 1, stream);
-  raft::update_host(&L_22_host, L_22, 1, stream);  // L_22 stores A_22
-  RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
-  L_22_host = std::sqrt(L_22_host - s_host);
-
-  // Check for numeric error with sqrt. If the matrix is not positive definite or
-  // the system is very ill conditioned then the A_22 - L_12 * L_12 can be
-  // negative, which would result in L_22 = NaN. A small positive eps parameter
-  // can be used to prevent this.
-  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; }
-  ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update");
-  raft::update_device(L_22, &L_22_host, 1, stream);
+  detail::choleskyRank1Update(handle, L, n, ld, workspace, n_bytes, uplo, stream, eps);
 }
 }; // namespace linalg
 }; // namespace raft
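The removed body also documents the calling convention that the public wrapper keeps: when workspace == nullptr, the function only reports the required scratch size through *n_bytes. A sketch of that two-phase pattern, with the function and variable names chosen for illustration:

#include <raft/handle.hpp>
#include <raft/linalg/cholesky_r1_update.hpp>
#include <rmm/device_buffer.hpp>
#include <cublas_v2.h>

// L is an n x n column-major buffer (leading dimension ld) whose top-left
// (n-1) x (n-1) block already holds a Cholesky factor, and whose n-th
// row/column holds the new entries of A.
void rank1_update_example(
  const raft::handle_t& handle, float* L, int n, int ld, cudaStream_t stream)
{
  int n_bytes = 0;
  // Pass 1: workspace == nullptr, so only the required size is written.
  raft::linalg::choleskyRank1Update(
    handle, L, n, ld, nullptr, &n_bytes, CUBLAS_FILL_MODE_LOWER, stream);

  rmm::device_buffer workspace(n_bytes, stream);
  // Pass 2: solves L_11 x = A_12, then sets L_22 = sqrt(A_22 - x.x).
  raft::linalg::choleskyRank1Update(
    handle, L, n, ld, workspace.data(), &n_bytes, CUBLAS_FILL_MODE_LOWER, stream);
}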