diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 5b2df3bcb3..941f7cc633 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include <raft/matrix/matrix.cuh>
+#include <raft/matrix/matrix.hpp>
 #include
 #include
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index cc912d7d86..a50448acbe 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -18,7 +18,7 @@
 #include
 #include
-#include <raft/matrix/matrix.cuh>
+#include <raft/matrix/matrix.hpp>
 #include
 #include
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 8b40a80903..2315920689 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -21,8 +21,8 @@
 #include
 #include
 #include
-#include <raft/matrix/math.cuh>
-#include <raft/matrix/matrix.cuh>
+#include <raft/matrix/math.hpp>
+#include <raft/matrix/matrix.hpp>
 #include
 #include
 #include "eig.cuh"
diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh
new file mode 100644
index 0000000000..f79cb397b7
--- /dev/null
+++ b/cpp/include/raft/matrix/detail/math.cuh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cub/cub.cuh>
+#include <raft/cuda_utils.cuh>
+
+namespace raft {
+namespace matrix {
+namespace detail {
+
+// Computes the argmax(d_in) column-wise in a DxN matrix
+template <typename T, int TPB>
+__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
+  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // compute maxIndex=argMax index for column
+  using KVP = cub::KeyValuePair<int, T>;
+  int rowStart = blockIdx.x * D;
+  KVP thread_data(-1, -raft::myInf<T>());
+
+  for (int i = threadIdx.x; i < D; i += TPB) {
+    int idx = rowStart + i;
+    thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
+  }
+
+  auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
+
+  if (threadIdx.x == 0) {
+    argmax[blockIdx.x] = maxKV.key;
+  }
+}
+
+template <typename math_t>
+void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
+            cudaStream_t stream) {
+  int D = n_rows;
+  int N = n_cols;
+  if (D <= 32) {
+    argmaxKernel<math_t, 32><<<N, 32, 0, stream>>>(in, D, N, out);
+  } else if (D <= 64) {
+    argmaxKernel<math_t, 64><<<N, 64, 0, stream>>>(in, D, N, out);
+  } else if (D <= 128) {
+    argmaxKernel<math_t, 128><<<N, 128, 0, stream>>>(in, D, N, out);
+  } else {
+    argmaxKernel<math_t, 256><<<N, 256, 0, stream>>>(in, D, N, out);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+// Utility kernel needed for signFlip.
+// Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
+// flipping the sign if the |max| value for each column is negative.
+template <typename T, int TPB>
+__global__ void signFlipKernel(T *d_in, int D, int N) {
+  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+
+  // compute maxIndex=argMax (with abs()) index for column
+  using KVP = cub::KeyValuePair<int, T>;
+  int rowStart = blockIdx.x * D;
+  KVP thread_data(0, 0);
+  for (int i = threadIdx.x; i < D; i += TPB) {
+    int idx = rowStart + i;
+    thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx])));
+  }
+  auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
+
+  // flip column sign if d_in[maxIndex] < 0
+  __shared__ bool need_sign_flip;
+  if (threadIdx.x == 0) {
+    need_sign_flip = d_in[maxKV.key] < T(0);
+  }
+  __syncthreads();
+
+  if (need_sign_flip) {
+    for (int i = threadIdx.x; i < D; i += TPB) {
+      int idx = rowStart + i;
+      d_in[idx] = -d_in[idx];
+    }
+  }
+}
+
+template <typename math_t>
+void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
+  int D = n_rows;
+  int N = n_cols;
+  auto data = inout;
+  if (D <= 32) {
+    signFlipKernel<math_t, 32><<<N, 32, 0, stream>>>(data, D, N);
+  } else if (D <= 64) {
+    signFlipKernel<math_t, 64><<<N, 64, 0, stream>>>(data, D, N);
+  } else if (D <= 128) {
+    signFlipKernel<math_t, 128><<<N, 128, 0, stream>>>(data, D, N);
+  } else {
+    signFlipKernel<math_t, 256><<<N, 256, 0, stream>>>(data, D, N);
+  }
+  CUDA_CHECK(cudaPeekAtLastError());
+}
+
+} // end namespace detail
+} // end namespace matrix
+} // end namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
new file mode 100644
index 0000000000..8293d01bdb
--- /dev/null
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/cache/cache_util.cuh>
+#include <raft/cuda_utils.cuh>
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+namespace raft {
+namespace matrix {
+namespace detail {
+
+template <typename m_t, typename idx_array_t = int, typename idx_t = size_t>
+void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out,
+              const idx_array_t *indices, idx_t n_rows_indices,
+              cudaStream_t stream, bool rowMajor = false) {
+  if (rowMajor) {
+    const idx_t TPB = 256;
+    cache::
+      get_vecs<<<raft::ceildiv(n_rows_indices * n_cols, TPB), TPB, 0, stream>>>(
+        in, n_cols, indices, n_rows_indices, out);
+    CUDA_CHECK(cudaPeekAtLastError());
+    return;
+  }
+
+  idx_t size = n_rows_indices * n_cols;
+  auto counting = thrust::make_counting_iterator<idx_t>(0);
+
+  thrust::for_each(rmm::exec_policy(stream), counting, counting + size,
+                   [=] __device__(idx_t idx) {
+                     idx_t row = idx % n_rows_indices;
+                     idx_t col = idx / n_rows_indices;
+
+                     out[col * n_rows_indices + row] =
+                       in[col * n_rows + indices[row]];
+                   });
+}
+
+/**
+ * @brief Kernel for copying a slice of a big matrix to a small matrix whose
+ * size matches that slice
+ * @param src_d: input matrix
+ * @param m: number of rows of input matrix
+ * @param n: number of columns of input matrix
+ * @param dst_d: output matrix
+ * @param x1, y1: coordinate of the top-left point of the wanted area (0-based)
+ * @param x2, y2: coordinate of the bottom-right point of the wanted area
+ * (1-based)
+ */
+template <typename m_t, typename idx_t = int>
+__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1,
+                      idx_t y1, idx_t x2, idx_t y2) {
+  idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  idx_t dm = x2 - x1, dn = y2 - y1;
+  if (idx < dm * dn) {
+    idx_t i = idx % dm, j = idx / dm;
+    idx_t is = i + x1, js = j + y1;
+    dst_d[idx] = src_d[is + js * m];
+  }
+}
+
+template <typename m_t, typename idx_t = int>
+void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1,
+                 idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) {
+  // Slicing
+  dim3 block(64);
+  dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x);
+  slice<<<grid, block, 0, stream>>>(in, n_rows, n_cols, out, x1, y1, x2, y2);
+}
+
+/**
+ * @brief Kernel for copying the upper triangular part of a matrix to another
+ * @param src: input matrix with a size of mxn
+ * @param dst: output matrix with a size of kxk
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param k: min(n_rows, n_cols)
+ */
+template <typename m_t, typename idx_t = int>
+__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows,
+                                   idx_t n_cols, idx_t k) {
+  idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  idx_t m = n_rows, n = n_cols;
+  if (idx < m * n) {
+    idx_t i = idx % m, j = idx / m;
+    if (i < k && j < k && j >= i) {
+      dst[i + j * k] = src[idx];
+    }
+  }
+}
+
+template <typename m_t, typename idx_t = int>
+void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols,
+                         cudaStream_t stream) {
+  idx_t m = n_rows, n = n_cols;
+  idx_t k = min(m, n);
+  dim3 block(64);
+  dim3 grid((m * n + block.x - 1) / block.x);
+  getUpperTriangular<<<grid, block, 0, stream>>>(src, dst, m, n, k);
+}
+
+/**
+ * @brief Copy a vector to the diagonal of a matrix
+ * @param vec: vector of length k = min(n_rows, n_cols)
+ * @param matrix: matrix of size n_rows x n_cols
+ * @param m: number of rows of the matrix
+ * @param n: number of columns of the matrix
+ * @param k: dimensionality
+ */
+template <typename m_t, typename idx_t = int>
+__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m,
+                                           idx_t n, idx_t k) {
+  idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+
+  if (idx < k) {
+    matrix[idx + idx * m] = vec[idx];
+  }
+}
+
+template <typename m_t, typename idx_t = int>
+void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols,
+                              cudaStream_t stream) {
+  idx_t k = min(n_rows, n_cols);
+  dim3 block(64);
+  dim3 grid((k + block.x - 1) / block.x);
+  copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows,
+                                                         n_cols, k);
+}
+
+/**
+ * @brief Calculate the inverse of the diagonal of a square matrix
+ * element-wise and in place
+ * @param in: square input matrix with size len x len
+ * @param len: size of one side of the matrix
+ */
+template <typename m_t, typename idx_t = int>
+__global__ void matrixDiagonalInverse(m_t *in, idx_t len) {
+  idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  if (idx < len) {
+    in[idx + idx * len] = 1.0 / in[idx + idx * len];
+  }
+}
+
+template <typename m_t, typename idx_t = int>
+void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) {
+  dim3 block(64);
+  dim3 grid((len + block.x - 1) / block.x);
+  matrixDiagonalInverse<<<grid, block, 0, stream>>>(in, len);
+}
+
+} // end namespace detail
+} // end namespace matrix
+} // end namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.hpp
similarity index 84%
rename from cpp/include/raft/matrix/math.cuh
rename to cpp/include/raft/matrix/math.hpp
index 41ca85dce0..e67440019f 100644
--- a/cpp/include/raft/matrix/math.cuh
+++ b/cpp/include/raft/matrix/math.hpp
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "detail/math.cuh"
+
 #include
 #include
 #include
@@ -304,29 +306,6 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len,
 
 /** @} */
 
-// Computes the argmax(d_in) column-wise in a DxN matrix
-template <typename T, int TPB>
-__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
-  typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  // compute maxIndex=argMax index for column
-  using KVP = cub::KeyValuePair<int, T>;
-  int rowStart = blockIdx.x * D;
-  KVP thread_data(-1, -raft::myInf<T>());
-
-  for (int i = threadIdx.x; i < D; i += TPB) {
-    int idx = rowStart + i;
-    thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
-  }
-
-  auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
-
-  if (threadIdx.x == 0) {
-    argmax[blockIdx.x] = maxKV.key;
-  }
-}
-
 /**
  * @brief Argmax: find the row idx with maximum value for each column
  * @param in: input matrix
@@ -338,51 +317,7 @@ __global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
 template <typename math_t>
 void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
             cudaStream_t stream) {
-  int D = n_rows;
-  int N = n_cols;
-  if (D <= 32) {
-    argmaxKernel<math_t, 32><<<N, 32, 0, stream>>>(in, D, N, out);
-  } else if (D <= 64) {
-    argmaxKernel<math_t, 64><<<N, 64, 0, stream>>>(in, D, N, out);
-  } else if (D <= 128) {
-    argmaxKernel<math_t, 128><<<N, 128, 0, stream>>>(in, D, N, out);
-  } else {
-    argmaxKernel<math_t, 256><<<N, 256, 0, stream>>>(in, D, N, out);
-  }
-  CUDA_CHECK(cudaPeekAtLastError());
-}
-
-// Utility kernel needed for signFlip.
-// Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
-// flipping the sign if the |max| value for each column is negative.
-template -__global__ void signFlipKernel(T *d_in, int D, int N) { - typedef cub::BlockReduce, TPB> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - // compute maxIndex=argMax (with abs()) index for column - using KVP = cub::KeyValuePair; - int rowStart = blockIdx.x * D; - KVP thread_data(0, 0); - for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; - thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx]))); - } - auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax()); - - // flip column sign if d_in[maxIndex] < 0 - __shared__ bool need_sign_flip; - if (threadIdx.x == 0) { - need_sign_flip = d_in[maxKV.key] < T(0); - } - __syncthreads(); - - if (need_sign_flip) { - for (int i = threadIdx.x; i < D; i += TPB) { - int idx = rowStart + i; - d_in[idx] = -d_in[idx]; - } - } + detail::argmax(in, n_rows, n_cols, out, stream); } /** @@ -395,19 +330,7 @@ __global__ void signFlipKernel(T *d_in, int D, int N) { */ template void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) { - int D = n_rows; - int N = n_cols; - auto data = inout; - if (D <= 32) { - signFlipKernel<<>>(data, D, N); - } else if (D <= 64) { - signFlipKernel<<>>(data, D, N); - } else if (D <= 128) { - signFlipKernel<<>>(data, D, N); - } else { - signFlipKernel<<>>(data, D, N); - } - CUDA_CHECK(cudaPeekAtLastError()); + detail::signFlip(inout, n_rows, n_cols, stream); } template diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.hpp similarity index 68% rename from cpp/include/raft/matrix/matrix.cuh rename to cpp/include/raft/matrix/matrix.hpp index 688b92da09..8dd9fbf487 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.hpp @@ -16,17 +16,15 @@ #pragma once +#include "detail/matrix.cuh" + #include #include #include #include -#include #include #include -#include -#include #include -#include namespace raft { namespace matrix { @@ -52,26 +50,8 @@ template void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, const idx_array_t *indices, idx_t n_rows_indices, cudaStream_t stream, bool rowMajor = false) { - if (rowMajor) { - const idx_t TPB = 256; - cache:: - get_vecs<<>>( - in, n_cols, indices, n_rows_indices, out); - CUDA_CHECK(cudaPeekAtLastError()); - return; - } - - idx_t size = n_rows_indices * n_cols; - auto counting = thrust::make_counting_iterator(0); - - thrust::for_each(rmm::exec_policy(stream), counting, counting + size, - [=] __device__(idx_t idx) { - idx_t row = idx % n_rows_indices; - idx_t col = idx / n_rows_indices; - - out[col * n_rows_indices + row] = - in[col * n_rows + indices[row]]; - }); + detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, + rowMajor); } /** @@ -185,10 +165,10 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { */ template void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', - char v_separator = '\n') { + char v_separator = '\n', + cudaStream_t stream = rmm::cuda_stream_default) { std::vector h_matrix = std::vector(n_cols * n_rows); - CUDA_CHECK(cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t), - cudaMemcpyDeviceToHost)); + raft::update_host(h_matrix.data(), in, n_cols * n_rows, stream); for (idx_t i = 0; i < n_rows; i++) { for (idx_t j = 0; j < n_cols; j++) { @@ -214,29 +194,6 @@ void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) { } } -/** - * @brief Kernel for copying a slice of a big matrix to a small matrix with a - * size matches 
that slice - * @param src_d: input matrix - * @param m: number of rows of input matrix - * @param n: number of columns of input matrix - * @param dst_d: output matrix - * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) - * @param x2, y2: coordinate of the bottom-right point of the wanted area - * (1-based) - */ -template -__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, - idx_t y1, idx_t x2, idx_t y2) { - idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - idx_t dm = x2 - x1, dn = y2 - y1; - if (idx < dm * dn) { - idx_t i = idx % dm, j = idx / dm; - idx_t is = i + x1, js = j + y1; - dst_d[idx] = src_d[is + js * m]; - } -} - /** * @brief Slice a matrix (in-place) * @param in: input matrix @@ -253,31 +210,7 @@ __global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1, template void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1, idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) { - // Slicing - dim3 block(64); - dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x); - slice<<>>(in, n_rows, n_cols, out, x1, y1, x2, y2); -} - -/** - * @brief Kernel for copying the upper triangular part of a matrix to another - * @param src: input matrix with a size of mxn - * @param dst: output matrix with a size of kxk - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param k: min(n_rows, n_cols) - */ -template -__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, - idx_t n_cols, idx_t k) { - idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - idx_t m = n_rows, n = n_cols; - if (idx < m * n) { - idx_t i = idx % m, j = idx / m; - if (i < k && j < k && j >= i) { - dst[i + j * k] = src[idx]; - } - } + detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, stream); } /** @@ -291,29 +224,7 @@ __global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, template void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - idx_t m = n_rows, n = n_cols; - idx_t k = min(m, n); - dim3 block(64); - dim3 grid((m * n + block.x - 1) / block.x); - getUpperTriangular<<>>(src, dst, m, n, k); -} - -/** - * @brief Copy a vector to the diagonal of a matrix - * @param vec: vector of length k = min(n_rows, n_cols) - * @param matrix: matrix of size n_rows x n_cols - * @param m: number of rows of the matrix - * @param n: number of columns of the matrix - * @param k: dimensionality - */ -template -__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, - idx_t n, idx_t k) { - idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - - if (idx < k) { - matrix[idx + idx * m] = vec[idx]; - } + detail::copyUpperTriangular(src, dst, n_rows, n_cols, stream); } /** @@ -327,25 +238,7 @@ __global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m, template void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - idx_t k = min(n_rows, n_cols); - dim3 block(64); - dim3 grid((k + block.x - 1) / block.x); - copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, - n_cols, k); -} - -/** - * @brief Calculate the inverse of the diagonal of a square matrix - * element-wise and in place - * @param in: square input matrix with size len x len - * @param len: size of one side of the matrix - */ -template -__global__ void matrixDiagonalInverse(m_t *in, idx_t len) { - idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx < len) { - in[idx + idx * len] = 1.0 / 
in[idx + idx * len]; - } + detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, stream); } /** @@ -356,9 +249,7 @@ __global__ void matrixDiagonalInverse(m_t *in, idx_t len) { */ template void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) { - dim3 block(64); - dim3 grid((len + block.x - 1) / block.x); - matrixDiagonalInverse<<>>(in, len); + detail::getDiagonalInverseMatrix(in, len, stream); } /** diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/detail/rng_impl.cuh similarity index 62% rename from cpp/include/raft/random/rng.cuh rename to cpp/include/raft/random/detail/rng_impl.cuh index 3d2e44e49b..654c46bbf9 100644 --- a/cpp/include/raft/random/rng.cuh +++ b/cpp/include/raft/random/detail/rng_impl.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -27,10 +28,10 @@ #include #include #include -#include "rng_impl.cuh" namespace raft { namespace random { +namespace detail { /** all different generator types used */ enum GeneratorType { @@ -42,12 +43,48 @@ enum GeneratorType { GenKiss99 }; +template +DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, + Type sigma2, Type mu2) { + constexpr Type twoPi = Type(2.0) * Type(3.141592654); + constexpr Type minus2 = -Type(2.0); + Type R = raft::mySqrt(minus2 * raft::myLog(val1)); + Type theta = twoPi * val2; + Type s, c; + raft::mySinCos(theta, s, c); + val1 = R * c * sigma1 + mu1; + val2 = R * s * sigma2 + mu2; +} +template +DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { + box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); +} + +/** + * @brief generator-agnostic way of generating random numbers + * @tparam GenType the generator object that expose 'next' method + */ +template +struct Generator { + DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) + : gen(seed, subsequence, offset) {} + + template + DI void next(Type &ret) { + gen.next(ret); + } + + private: + /** the actual generator */ + GenType gen; +}; + template __global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr, LenType len, Lambda randOp) { LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x; - detail::Generator gen(seed, (uint64_t)tid, offset); + Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; for (LenType idx = tid; idx < len; idx += stride) { MathType val; @@ -62,7 +99,7 @@ template gen(seed, (uint64_t)tid, offset); + Generator gen(seed, (uint64_t)tid, offset); const LenType stride = gridDim.x * blockDim.x; for (LenType idx = tid; idx < len; idx += stride) { MathType val1, val2; @@ -84,47 +121,252 @@ __global__ void constFillKernel(Type *ptr, int len, Type val) { } } -/** - * @brief Helper method to compute Box Muller transform - * - * @tparam Type data type - * - * @param[inout] val1 first value - * @param[inout] val2 second value - * @param[in] sigma1 standard deviation of output gaussian for first value - * @param[in] mu1 mean of output gaussian for first value - * @param[in] sigma2 standard deviation of output gaussian for second value - * @param[in] mu2 mean of output gaussian for second value - * @{ - */ -template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1, - Type sigma2, Type mu2) { - constexpr Type twoPi = Type(2.0) * Type(3.141592654); - constexpr Type minus2 = -Type(2.0); - Type R = raft::mySqrt(minus2 * raft::myLog(val1)); - Type theta = twoPi * val2; - Type s, c; - raft::mySinCos(theta, s, c); - val1 = R * c * sigma1 + mu1; - val2 
= R * s * sigma2 + mu2; -} -template -DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) { - box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu1); -} -/** @} */ +/** Philox-based random number generator */ +// Courtesy: Jakub Szuppe +struct PhiloxGenerator { + /** + * @brief ctor. Initializes the state for RNG + * @param seed random seed (can be same across all threads) + * @param subsequence as found in curand docs + * @param offset as found in curand docs + */ + DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + curand_init(seed, subsequence, offset, &state); + } -/** The main random number generator class, fully on GPUs */ -class Rng { - public: /** - * @brief ctor - * @param _s 64b seed used to initialize the RNG - * @param _t backend device RNG generator type - * @note Refer to the `Rng::seed` method for details about seeding the engine + * @defgroup NextRand Generate the next random number + * @{ + */ + DI void next(float &ret) { ret = curand_uniform(&(this->state)); } + DI void next(double &ret) { ret = curand_uniform_double(&(this->state)); } + DI void next(uint32_t &ret) { ret = curand(&(this->state)); } + DI void next(uint64_t &ret) { + uint32_t a, b; + next(a); + next(b); + ret = (uint64_t)a | ((uint64_t)b << 32); + } + DI void next(int32_t &ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t &ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** the state for RNG */ + curandStatePhilox4_32_10_t state; +}; + +/** LFSR taps-filter for generating random numbers. */ +// Courtesy: Vinay Deshpande +struct TapsGenerator { + /** + * @brief ctor. Initializes the state for RNG + * @param seed the seed (can be same across all threads) + * @param subsequence unused + * @param offset unused + */ + DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; + uint64_t stride = blockDim.x * gridDim.x; + delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; + stride *= blockDim.y * gridDim.y; + delta += ((blockIdx.z * blockDim.z) + threadIdx.z) * stride; + state = seed + delta + 1; + } + + /** + * @defgroup NextRand Generate the next random number + * @{ + */ + template + DI void next(Type &ret) { + constexpr double ULL_LARGE = 1.8446744073709551614e19; + uint64_t val; + next(val); + ret = static_cast(val); + ret /= static_cast(ULL_LARGE); + } + DI void next(uint64_t &ret) { + constexpr uint64_t TAPS = 0x8000100040002000ULL; + constexpr int ROUNDS = 128; + for (int i = 0; i < ROUNDS; i++) + state = (state >> 1) ^ (-(state & 1ULL) & TAPS); + ret = state; + } + DI void next(uint32_t &ret) { + uint64_t val; + next(val); + ret = (uint32_t)val; + } + DI void next(int32_t &ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t &ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** the state for RNG */ + uint64_t state; +}; + +/** Kiss99-based random number generator */ + +struct Kiss99Generator { + /** + * @brief ctor. 
Initializes the state for RNG + * @param seed the seed (can be same across all threads) + * @param subsequence unused + * @param offset unused */ - Rng(uint64_t _s, GeneratorType _t = GenPhilox) + DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { + initKiss99(seed); + } + + /** + * @defgroup NextRand Generate the next random number + * @{ + */ + template + DI void next(Type &ret) { + constexpr double U_LARGE = 4.294967295e9; + uint32_t val; + next(val); + ret = static_cast(val); + ret /= static_cast(U_LARGE); + } + DI void next(uint32_t &ret) { + uint32_t MWC; + z = 36969 * (z & 65535) + (z >> 16); + w = 18000 * (w & 65535) + (w >> 16); + MWC = ((z << 16) + w); + jsr ^= (jsr << 17); + jsr ^= (jsr >> 13); + jsr ^= (jsr << 5); + jcong = 69069 * jcong + 1234567; + MWC = ((MWC ^ jcong) + jsr); + ret = MWC; + } + DI void next(uint64_t &ret) { + uint32_t a, b; + next(a); + next(b); + ret = (uint64_t)a | ((uint64_t)b << 32); + } + DI void next(int32_t &ret) { + uint32_t val; + next(val); + ret = int32_t(val & 0x7fffffff); + } + DI void next(int64_t &ret) { + uint64_t val; + next(val); + ret = int64_t(val & 0x7fffffffffffffff); + } + /** @} */ + + private: + /** one of the kiss99 states */ + uint32_t z; + /** one of the kiss99 states */ + uint32_t w; + /** one of the kiss99 states */ + uint32_t jsr; + /** one of the kiss99 states */ + uint32_t jcong; + + // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower + // 128 bits. It uses 32-bit wide multiply only. + DI void mulByFnv1a128Prime(uint32_t *h) { + typedef union { + uint32_t u32[2]; + uint64_t u64[1]; + } words64; + + // 128-bit FNV prime = p3 * 2^96 + p2 * 2^64 + p1 * 2^32 + p0 + // Here p0 = 315, p2 = 16777216, p1 = p3 = 0 + const uint32_t p0 = uint32_t(315), p2 = uint32_t(16777216); + // Partial products + words64 h0p0, h1p0, h2p0, h0p2, h3p0, h1p2; + + h0p0.u64[0] = uint64_t(h[0]) * p0; + h1p0.u64[0] = uint64_t(h[1]) * p0; + h2p0.u64[0] = uint64_t(h[2]) * p0; + h0p2.u64[0] = uint64_t(h[0]) * p2; + h3p0.u64[0] = uint64_t(h[3]) * p0; + h1p2.u64[0] = uint64_t(h[1]) * p2; + + // h_n[0] = LO(h[0]*p[0]); + // h_n[1] = HI(h[0]*p[0]) + LO(h[1]*p[0]); + // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); + // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); + uint32_t carry = 0; + h[0] = h0p0.u32[0]; + + h[1] = h0p0.u32[1] + h1p0.u32[0]; + carry = h[1] < h0p0.u32[1] ? 1 : 0; + + h[2] = h1p0.u32[1] + carry; + carry = h[2] < h1p0.u32[1] ? 1 : 0; + h[2] += h2p0.u32[0]; + carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; + h[2] += h0p2.u32[0]; + carry = h[2] < h0p2.u32[0] ? 
carry + 1 : carry; + + h[3] = h2p0.u32[1] + h0p2.u32[1] + h3p0.u32[0] + h1p2.u32[0] + carry; + return; + } + + DI void fnv1a128(uint32_t *hash, uint32_t txt) { + hash[0] ^= (txt >> 0) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 8) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 16) & 0xFF; + mulByFnv1a128Prime(hash); + hash[0] ^= (txt >> 24) & 0xFF; + mulByFnv1a128Prime(hash); + } + + DI void initKiss99(uint64_t seed) { + // Initialize hash to 128-bit FNV1a basis + uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; + + // Digest threadIdx, blockIdx and seed + fnv1a128(hash, threadIdx.x); + fnv1a128(hash, threadIdx.y); + fnv1a128(hash, threadIdx.z); + fnv1a128(hash, blockIdx.x); + fnv1a128(hash, blockIdx.y); + fnv1a128(hash, blockIdx.z); + fnv1a128(hash, uint32_t(seed)); + fnv1a128(hash, uint32_t(seed >> 32)); + + // Initialize KISS99 state with hash + z = hash[0]; + w = hash[1]; + jsr = hash[2]; + jcong = hash[3]; + } +}; + +/** The main random number generator class, fully on GPUs */ +class RngImpl { + public: + RngImpl(uint64_t _s, GeneratorType _t = GenPhilox) : type(_t), offset(0), // simple heuristic to make sure all SMs will be occupied properly @@ -134,28 +376,11 @@ class Rng { seed(_s); } - /** - * @brief Seed (and thus re-initialize) the underlying RNG engine - * @param _s 64b seed used to initialize the RNG - * @note If you need non-reproducibility, pass a seed that's, for example, a - * function of timestamp. Another example is to use the c++11's - * `std::random_device` for setting seed. - */ void seed(uint64_t _s) { gen.seed(_s); offset = 0; } - /** - * @brief Generates the 'a' and 'b' parameters for a modulo affine - * transformation equation: `(ax + b) % n` - * - * @tparam IdxT integer type - * - * @param[in] n the modulo range - * @param[out] a slope parameter - * @param[out] b intercept parameter - */ template void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { // always keep 'a' to be coprime to 'n' @@ -168,17 +393,6 @@ class Rng { b = gen() % n; } - /** - * @brief Generate uniformly distributed numbers in the given range - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param start start of the range - * @param end end of the range - * @param stream stream where to launch the kernel - * @{ - */ template void uniform(Type *ptr, LenType len, Type start, Type end, cudaStream_t stream) { @@ -203,19 +417,7 @@ class Rng { }, stream); } - /** @} */ - /** - * @brief Generate normal distributed numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param mu mean of the distribution - * @param sigma std-dev of the distribution - * @param stream stream where to launch the kernel - * @{ - */ template void normal(Type *ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) { @@ -240,28 +442,7 @@ class Rng { }, NumThreads, nBlocks, type, stream); } - /** @} */ - /** - * @brief Generate normal distributed table according to the given set of - * means and scalar standard deviations. - * - * Each row in this table conforms to a normally distributed n-dim vector - * whose mean is the input vector and standard deviation is the corresponding - * vector or scalar. 
Correlations among the dimensions itself is assumed to - * be absent. - * - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output table (dim = n_rows x n_cols) - * @param n_rows number of rows in the table - * @param n_cols number of columns in the table - * @param mu mean vector (dim = n_cols x 1). - * @param sigma_vec std-dev vector of each component (dim = n_cols x 1). Pass - * a nullptr to use the same scalar 'sigma' across all components - * @param sigma scalar sigma to be used if 'sigma_vec' is nullptr - * @param stream stream where to launch the kernel - */ template void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, const Type *sigma_vec, Type sigma, cudaStream_t stream) { @@ -280,33 +461,13 @@ class Rng { NumThreads, nBlocks, type, stream); } - /** - * @brief Fill an array with the given value - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param val value to be filled - * @param stream stream where to launch the kernel - */ template void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { - constFillKernel<<>>(ptr, len, val); + detail::constFillKernel + <<>>(ptr, len, val); CUDA_CHECK(cudaPeekAtLastError()); } - /** - * @brief Generate bernoulli distributed boolean array - * - * @tparam Type data type in which to compute the probabilities - * @tparam OutType output data type - * @tparam LenType data type used to represent length of the arrays - * - * @param[out] ptr the output array - * @param[in] len the number of elements in the output - * @param[in] prob coin-toss probability for heads - * @param[in] stream stream where to launch the kernel - */ template void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { custom_distribution( @@ -314,16 +475,6 @@ class Rng { stream); } - /** - * @brief Generate bernoulli distributed array and applies scale - * @tparam Type data type in which to compute the probabilities - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param prob coin-toss probability for heads - * @param scale scaling factor - * @param stream stream where to launch the kernel - */ template void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, cudaStream_t stream) { @@ -337,17 +488,6 @@ class Rng { stream); } - /** - * @brief Generate gumbel distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param mu mean value - * @param beta scale value - * @param stream stream where to launch the kernel - * @note https://en.wikipedia.org/wiki/Gumbel_distribution - */ template void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { custom_distribution( @@ -358,16 +498,6 @@ class Rng { stream); } - /** - * @brief Generate lognormal distributed numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr the output array - * @param len the number of elements in the output - * @param mu mean of the distribution - * @param sigma std-dev of the distribution - * @param stream stream where to 
launch the kernel - */ template void lognormal(Type *ptr, LenType len, Type mu, Type sigma, cudaStream_t stream) { @@ -381,16 +511,6 @@ class Rng { NumThreads, nBlocks, type, stream); } - /** - * @brief Generate logistic distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param mu mean value - * @param scale scale value - * @param stream stream where to launch the kernel - */ template void logistic(Type *ptr, LenType len, Type mu, Type scale, cudaStream_t stream) { @@ -403,15 +523,6 @@ class Rng { stream); } - /** - * @brief Generate exponentially distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param lambda the lambda - * @param stream stream where to launch the kernel - */ template void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { custom_distribution( @@ -423,15 +534,6 @@ class Rng { stream); } - /** - * @brief Generate rayleigh distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param sigma the sigma - * @param stream stream where to launch the kernel - */ template void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { custom_distribution( @@ -444,16 +546,6 @@ class Rng { stream); } - /** - * @brief Generate laplace distributed random numbers - * @tparam Type data type of output random number - * @tparam LenType data type used to represent length of the arrays - * @param ptr output array - * @param len number of elements in the output array - * @param mu the mean - * @param scale the scale - * @param stream stream where to launch the kernel - */ template void laplace(Type *ptr, LenType len, Type mu, Type scale, cudaStream_t stream) { @@ -474,31 +566,6 @@ class Rng { stream); } - /** - * @brief Sample the input array without replacement, optionally based on the - * input weight vector for each element in the array - * - * Implementation here is based on the `one-pass sampling` algo described here: - * https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf - * - * @note In the sampled array the elements which are picked will always appear - * in the increasing order of their weights as computed using the exponential - * distribution. So, if you're particular about the order (for eg. array - * permutations), then this might not be the right choice! - * - * @tparam DataT data type - * @tparam WeightsT weights type - * @tparam IdxT index type - * @param out output sampled array (of length 'sampledLen') - * @param outIdx indices of the sampled array (of length 'sampledLen'). Pass - * a nullptr if this is not required. - * @param in input array to be sampled (of length 'len') - * @param wts weights array (of length 'len'). 
Pass a nullptr if uniform - * sampling is desired - * @param sampledLen output sampled array length - * @param len input array length - * @param stream cuda stream - */ template void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, IdxT *outIdx, const DataT *in, @@ -535,24 +602,9 @@ class Rng { CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream)); } - scatter(out, in, outIdxPtr, sampledLen, stream); + raft::scatter(out, in, outIdxPtr, sampledLen, stream); } - /** - * @brief Core method to generate a pdf based on the cdf that is defined in - * the input device lambda - * - * @tparam OutType output type - * @tparam MathType type on which arithmetic is done - * @tparam LenTyp index type - * @tparam Lambda device lambda (or operator) - * - * @param[out] ptr output buffer [on device] [len = len] - * @param[in] len number of elements to be generated - * @param[in] randOp the device lambda or operator - * @param[in] stream cuda stream - * @{ - */ template void custom_distribution(OutType *ptr, LenType len, Lambda randOp, @@ -573,10 +625,10 @@ class Rng { /** generator type */ GeneratorType type; /** - * offset is also used to initialize curand state. - * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64), - * but is still a large period. - */ + * offset is also used to initialize curand state. + * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64), + * but is still a large period. + */ uint64_t offset; /** number of blocks to launch */ int nBlocks; @@ -617,15 +669,18 @@ class Rng { nThreads, nBlocks); switch (type) { case GenPhilox: - randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; case GenTaps: - randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; case GenKiss99: - randKernel + detail::randKernel <<>>(seed, offset, ptr, len, randOp); break; default: @@ -646,17 +701,18 @@ class Rng { nThreads, nBlocks); switch (type) { case GenPhilox: - rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenTaps: - rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; case GenKiss99: - rand2Kernel + detail::rand2Kernel <<>>(seed, offset, ptr, len, rand2Op); break; default: @@ -667,5 +723,6 @@ class Rng { } }; +}; // end namespace detail }; // end namespace random }; // end namespace raft diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp new file mode 100644 index 0000000000..b6b0911ab0 --- /dev/null +++ b/cpp/include/raft/random/rng.hpp @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include "detail/rng_impl.cuh"
+
+namespace raft {
+namespace random {
+
+/** all different generator types used */
+using detail::GeneratorType;
+/** curand-based philox generator */
+using detail::GenPhilox;
+/** GenTaps : LFSR taps generator */
+using detail::GenTaps;
+/** GenKiss99 : kiss99 generator (currently the fastest) */
+using detail::GenKiss99;
+
+/** Philox-based random number generator */
+using detail::PhiloxGenerator;
+/** LFSR taps-filter for generating random numbers. */
+using detail::TapsGenerator;
+/** Kiss99-based random number generator */
+using detail::Kiss99Generator;
+
+/**
+ * @brief Helper method to compute Box Muller transform
+ *
+ * @tparam Type data type
+ *
+ * @param[inout] val1 first value
+ * @param[inout] val2 second value
+ * @param[in] sigma1 standard deviation of output gaussian for first value
+ * @param[in] mu1 mean of output gaussian for first value
+ * @param[in] sigma2 standard deviation of output gaussian for second value
+ * @param[in] mu2 mean of output gaussian for second value
+ * @{
+ */
+template <typename Type>
+DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1,
+                             Type sigma2, Type mu2) {
+  detail::box_muller_transform(val1, val2, sigma1, mu1, sigma2, mu2);
+}
+template <typename Type>
+DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) {
+  detail::box_muller_transform(val1, val2, sigma1, mu1);
+}
+/** @} */
+
+/** The main random number generator class, fully on GPUs */
+class Rng : public detail::RngImpl {
+ public:
+  /**
+   * @brief ctor
+   * @param _s 64b seed used to initialize the RNG
+   * @param _t backend device RNG generator type
+   * @note Refer to the `Rng::seed` method for details about seeding the engine
+   */
+  Rng(uint64_t _s, GeneratorType _t = GenPhilox) : detail::RngImpl{_s, _t} {}
+
+  /**
+   * @brief Seed (and thus re-initialize) the underlying RNG engine
+   * @param _s 64b seed used to initialize the RNG
+   * @note If you need non-reproducibility, pass a seed that's, for example, a
+   * function of timestamp. Another example is to use the c++11's
+   * `std::random_device` for setting seed.
+ */ + void seed(uint64_t _s) { detail::RngImpl::seed(_s); } + + /** + * @brief Generates the 'a' and 'b' parameters for a modulo affine + * transformation equation: `(ax + b) % n` + * + * @tparam IdxT integer type + * + * @param[in] n the modulo range + * @param[out] a slope parameter + * @param[out] b intercept parameter + */ + template + void affine_transform_params(IdxT n, IdxT &a, IdxT &b) { + detail::RngImpl::affine_transform_params(n, a, b); + } + + /** + * @brief Generate uniformly distributed numbers in the given range + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param start start of the range + * @param end end of the range + * @param stream stream where to launch the kernel + * @{ + */ + template + void uniform(Type *ptr, LenType len, Type start, Type end, + cudaStream_t stream) { + detail::RngImpl::uniform(ptr, len, start, end, stream); + } + template + void uniformInt(IntType *ptr, LenType len, IntType start, IntType end, + cudaStream_t stream) { + detail::RngImpl::uniformInt(ptr, len, start, end, stream); + } + /** @} */ + + /** + * @brief Generate normal distributed numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param mu mean of the distribution + * @param sigma std-dev of the distribution + * @param stream stream where to launch the kernel + * @{ + */ + template + void normal(Type *ptr, LenType len, Type mu, Type sigma, + cudaStream_t stream) { + detail::RngImpl::normal(ptr, len, mu, sigma, stream); + } + template + void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma, + cudaStream_t stream) { + detail::RngImpl::normalInt(ptr, len, mu, sigma, stream); + } + /** @} */ + + /** + * @brief Generate normal distributed table according to the given set of + * means and scalar standard deviations. + * + * Each row in this table conforms to a normally distributed n-dim vector + * whose mean is the input vector and standard deviation is the corresponding + * vector or scalar. Correlations among the dimensions itself is assumed to + * be absent. + * + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output table (dim = n_rows x n_cols) + * @param n_rows number of rows in the table + * @param n_cols number of columns in the table + * @param mu mean vector (dim = n_cols x 1). + * @param sigma_vec std-dev vector of each component (dim = n_cols x 1). 
Pass + * a nullptr to use the same scalar 'sigma' across all components + * @param sigma scalar sigma to be used if 'sigma_vec' is nullptr + * @param stream stream where to launch the kernel + */ + template + void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu, + const Type *sigma_vec, Type sigma, cudaStream_t stream) { + detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu, sigma_vec, sigma, + stream); + } + + /** + * @brief Fill an array with the given value + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param val value to be filled + * @param stream stream where to launch the kernel + */ + template + void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) { + detail::RngImpl::fill(ptr, len, val, stream); + } + + /** + * @brief Generate bernoulli distributed boolean array + * + * @tparam Type data type in which to compute the probabilities + * @tparam OutType output data type + * @tparam LenType data type used to represent length of the arrays + * + * @param[out] ptr the output array + * @param[in] len the number of elements in the output + * @param[in] prob coin-toss probability for heads + * @param[in] stream stream where to launch the kernel + */ + template + void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) { + detail::RngImpl::bernoulli(ptr, len, prob, stream); + } + + /** + * @brief Generate bernoulli distributed array and applies scale + * @tparam Type data type in which to compute the probabilities + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param prob coin-toss probability for heads + * @param scale scaling factor + * @param stream stream where to launch the kernel + */ + template + void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, + cudaStream_t stream) { + detail::RngImpl::scaled_bernoulli(ptr, len, prob, scale, stream); + } + + /** + * @brief Generate gumbel distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu mean value + * @param beta scale value + * @param stream stream where to launch the kernel + * @note https://en.wikipedia.org/wiki/Gumbel_distribution + */ + template + void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) { + detail::RngImpl::gumbel(ptr, len, mu, beta, stream); + } + + /** + * @brief Generate lognormal distributed numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param mu mean of the distribution + * @param sigma std-dev of the distribution + * @param stream stream where to launch the kernel + */ + template + void lognormal(Type *ptr, LenType len, Type mu, Type sigma, + cudaStream_t stream) { + detail::RngImpl::lognormal(ptr, len, mu, sigma, stream); + } + + /** + * @brief Generate logistic distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu mean 
value + * @param scale scale value + * @param stream stream where to launch the kernel + */ + template + void logistic(Type *ptr, LenType len, Type mu, Type scale, + cudaStream_t stream) { + detail::RngImpl::logistic(ptr, len, mu, scale, stream); + } + + /** + * @brief Generate exponentially distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param lambda the lambda + * @param stream stream where to launch the kernel + */ + template + void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) { + detail::RngImpl::exponential(ptr, len, lambda, stream); + } + + /** + * @brief Generate rayleigh distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param sigma the sigma + * @param stream stream where to launch the kernel + */ + template + void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) { + detail::RngImpl::rayleigh(ptr, len, sigma, stream); + } + + /** + * @brief Generate laplace distributed random numbers + * @tparam Type data type of output random number + * @tparam LenType data type used to represent length of the arrays + * @param ptr output array + * @param len number of elements in the output array + * @param mu the mean + * @param scale the scale + * @param stream stream where to launch the kernel + */ + template + void laplace(Type *ptr, LenType len, Type mu, Type scale, + cudaStream_t stream) { + detail::RngImpl::laplace(ptr, len, mu, scale, stream); + } + + /** + * @brief Sample the input array without replacement, optionally based on the + * input weight vector for each element in the array + * + * Implementation here is based on the `one-pass sampling` algo described here: + * https://www.ethz.ch/content/dam/ethz/special-interest/baug/ivt/ivt-dam/vpl/reports/1101-1200/ab1141.pdf + * + * @note In the sampled array the elements which are picked will always appear + * in the increasing order of their weights as computed using the exponential + * distribution. So, if you're particular about the order (for eg. array + * permutations), then this might not be the right choice! + * + * @tparam DataT data type + * @tparam WeightsT weights type + * @tparam IdxT index type + * @param out output sampled array (of length 'sampledLen') + * @param outIdx indices of the sampled array (of length 'sampledLen'). Pass + * a nullptr if this is not required. + * @param in input array to be sampled (of length 'len') + * @param wts weights array (of length 'len'). 
Pass a nullptr if uniform + * sampling is desired + * @param sampledLen output sampled array length + * @param len input array length + * @param stream cuda stream + */ + template + void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out, + IdxT *outIdx, const DataT *in, + const WeightsT *wts, IdxT sampledLen, IdxT len, + cudaStream_t stream) { + detail::RngImpl::sampleWithoutReplacement(handle, out, outIdx, in, wts, + sampledLen, len, stream); + } + + /** + * @brief Core method to generate a pdf based on the cdf that is defined in + * the input device lambda + * + * @tparam OutType output type + * @tparam MathType type on which arithmetic is done + * @tparam LenTyp index type + * @tparam Lambda device lambda (or operator) + * + * @param[out] ptr output buffer [on device] [len = len] + * @param[in] len number of elements to be generated + * @param[in] randOp the device lambda or operator + * @param[in] stream cuda stream + * @{ + */ + template + void custom_distribution(OutType *ptr, LenType len, Lambda randOp, + cudaStream_t stream) { + detail::RngImpl::custom_distribution(ptr, len, randOp, stream); + } + template + void custom_distribution2(OutType *ptr, LenType len, Lambda randOp, + cudaStream_t stream) { + detail::RngImpl::custom_distribution2(ptr, len, randOp, stream); + } + /** @} */ +}; + +}; // end namespace random +}; // end namespace raft diff --git a/cpp/include/raft/random/rng_impl.cuh b/cpp/include/raft/random/rng_impl.cuh deleted file mode 100644 index d44c6f018b..0000000000 --- a/cpp/include/raft/random/rng_impl.cuh +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace raft { -namespace random { -namespace detail { - -/** Philox-based random number generator */ -// Courtesy: Jakub Szuppe -struct PhiloxGenerator { - /** - * @brief ctor. Initializes the state for RNG - * @param seed random seed (can be same across all threads) - * @param subsequence as found in curand docs - * @param offset as found in curand docs - */ - DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - curand_init(seed, subsequence, offset, &state); - } - - /** - * @defgroup NextRand Generate the next random number - * @{ - */ - DI void next(float& ret) { ret = curand_uniform(&(this->state)); } - DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); } - DI void next(uint32_t& ret) { ret = curand(&(this->state)); } - DI void next(uint64_t& ret) { - uint32_t a, b; - next(a); - next(b); - ret = (uint64_t)a | ((uint64_t)b << 32); - } - DI void next(int32_t& ret) { - uint32_t val; - next(val); - ret = int32_t(val & 0x7fffffff); - } - DI void next(int64_t& ret) { - uint64_t val; - next(val); - ret = int64_t(val & 0x7fffffffffffffff); - } - /** @} */ - - private: - /** the state for RNG */ - curandStatePhilox4_32_10_t state; -}; - -/** LFSR taps-filter for generating random numbers. 
*/ -// Courtesy: Vinay Deshpande -struct TapsGenerator { - /** - * @brief ctor. Initializes the state for RNG - * @param seed the seed (can be same across all threads) - * @param subsequence unused - * @param offset unused - */ - DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x; - uint64_t stride = blockDim.x * gridDim.x; - delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride; - stride *= blockDim.y * gridDim.y; - delta += ((blockIdx.z * blockDim.z) + threadIdx.z) * stride; - state = seed + delta + 1; - } - - /** - * @defgroup NextRand Generate the next random number - * @{ - */ - template - DI void next(Type& ret) { - constexpr double ULL_LARGE = 1.8446744073709551614e19; - uint64_t val; - next(val); - ret = static_cast(val); - ret /= static_cast(ULL_LARGE); - } - DI void next(uint64_t& ret) { - constexpr uint64_t TAPS = 0x8000100040002000ULL; - constexpr int ROUNDS = 128; - for (int i = 0; i < ROUNDS; i++) - state = (state >> 1) ^ (-(state & 1ULL) & TAPS); - ret = state; - } - DI void next(uint32_t& ret) { - uint64_t val; - next(val); - ret = (uint32_t)val; - } - DI void next(int32_t& ret) { - uint32_t val; - next(val); - ret = int32_t(val & 0x7fffffff); - } - DI void next(int64_t& ret) { - uint64_t val; - next(val); - ret = int64_t(val & 0x7fffffffffffffff); - } - /** @} */ - - private: - /** the state for RNG */ - uint64_t state; -}; - -/** Kiss99-based random number generator */ - -struct Kiss99Generator { - /** - * @brief ctor. Initializes the state for RNG - * @param seed the seed (can be same across all threads) - * @param subsequence unused - * @param offset unused - */ - DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { - initKiss99(seed); - } - - /** - * @defgroup NextRand Generate the next random number - * @{ - */ - template - DI void next(Type& ret) { - constexpr double U_LARGE = 4.294967295e9; - uint32_t val; - next(val); - ret = static_cast(val); - ret /= static_cast(U_LARGE); - } - DI void next(uint32_t& ret) { - uint32_t MWC; - z = 36969 * (z & 65535) + (z >> 16); - w = 18000 * (w & 65535) + (w >> 16); - MWC = ((z << 16) + w); - jsr ^= (jsr << 17); - jsr ^= (jsr >> 13); - jsr ^= (jsr << 5); - jcong = 69069 * jcong + 1234567; - MWC = ((MWC ^ jcong) + jsr); - ret = MWC; - } - DI void next(uint64_t& ret) { - uint32_t a, b; - next(a); - next(b); - ret = (uint64_t)a | ((uint64_t)b << 32); - } - DI void next(int32_t& ret) { - uint32_t val; - next(val); - ret = int32_t(val & 0x7fffffff); - } - DI void next(int64_t& ret) { - uint64_t val; - next(val); - ret = int64_t(val & 0x7fffffffffffffff); - } - /** @} */ - - private: - /** one of the kiss99 states */ - uint32_t z; - /** one of the kiss99 states */ - uint32_t w; - /** one of the kiss99 states */ - uint32_t jsr; - /** one of the kiss99 states */ - uint32_t jcong; - - // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower - // 128 bits. It uses 32-bit wide multiply only. 
- DI void mulByFnv1a128Prime(uint32_t* h) { - typedef union { - uint32_t u32[2]; - uint64_t u64[1]; - } words64; - - // 128-bit FNV prime = p3 * 2^96 + p2 * 2^64 + p1 * 2^32 + p0 - // Here p0 = 315, p2 = 16777216, p1 = p3 = 0 - const uint32_t p0 = uint32_t(315), p2 = uint32_t(16777216); - // Partial products - words64 h0p0, h1p0, h2p0, h0p2, h3p0, h1p2; - - h0p0.u64[0] = uint64_t(h[0]) * p0; - h1p0.u64[0] = uint64_t(h[1]) * p0; - h2p0.u64[0] = uint64_t(h[2]) * p0; - h0p2.u64[0] = uint64_t(h[0]) * p2; - h3p0.u64[0] = uint64_t(h[3]) * p0; - h1p2.u64[0] = uint64_t(h[1]) * p2; - - // h_n[0] = LO(h[0]*p[0]); - // h_n[1] = HI(h[0]*p[0]) + LO(h[1]*p[0]); - // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]); - // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]); - uint32_t carry = 0; - h[0] = h0p0.u32[0]; - - h[1] = h0p0.u32[1] + h1p0.u32[0]; - carry = h[1] < h0p0.u32[1] ? 1 : 0; - - h[2] = h1p0.u32[1] + carry; - carry = h[2] < h1p0.u32[1] ? 1 : 0; - h[2] += h2p0.u32[0]; - carry = h[2] < h2p0.u32[0] ? carry + 1 : carry; - h[2] += h0p2.u32[0]; - carry = h[2] < h0p2.u32[0] ? carry + 1 : carry; - - h[3] = h2p0.u32[1] + h0p2.u32[1] + h3p0.u32[0] + h1p2.u32[0] + carry; - return; - } - - DI void fnv1a128(uint32_t* hash, uint32_t txt) { - hash[0] ^= (txt >> 0) & 0xFF; - mulByFnv1a128Prime(hash); - hash[0] ^= (txt >> 8) & 0xFF; - mulByFnv1a128Prime(hash); - hash[0] ^= (txt >> 16) & 0xFF; - mulByFnv1a128Prime(hash); - hash[0] ^= (txt >> 24) & 0xFF; - mulByFnv1a128Prime(hash); - } - - DI void initKiss99(uint64_t seed) { - // Initialize hash to 128-bit FNV1a basis - uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL}; - - // Digest threadIdx, blockIdx and seed - fnv1a128(hash, threadIdx.x); - fnv1a128(hash, threadIdx.y); - fnv1a128(hash, threadIdx.z); - fnv1a128(hash, blockIdx.x); - fnv1a128(hash, blockIdx.y); - fnv1a128(hash, blockIdx.z); - fnv1a128(hash, uint32_t(seed)); - fnv1a128(hash, uint32_t(seed >> 32)); - - // Initialize KISS99 state with hash - z = hash[0]; - w = hash[1]; - jsr = hash[2]; - jcong = hash[3]; - } -}; - -/** - * @brief generator-agnostic way of generating random numbers - * @tparam GenType the generator object that expose 'next' method - */ -template -struct Generator { - DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) - : gen(seed, subsequence, offset) {} - - template - DI void next(Type& ret) { - gen.next(ret); - } - - private: - /** the actual generator */ - GenType gen; -}; - -}; // end namespace detail -}; // end namespace random -}; // end namespace raft diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh index fc1a7c0d8d..b796b63dc8 100644 --- a/cpp/include/raft/sparse/selection/knn.cuh +++ b/cpp/include/raft/sparse/selection/knn.cuh @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh index 909e28708e..7354fa3497 100644 --- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh +++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh @@ -31,8 +31,8 @@ #include -#include -#include +#include +#include #include #include diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp index 876e91e877..b66ea025a2 100644 --- a/cpp/include/raft/spatial/knn/detail/processing.hpp +++ b/cpp/include/raft/spatial/knn/detail/processing.hpp @@ -19,8 +19,8 @@ #include #include 
#include -#include -#include +#include +#include #include namespace raft { diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh similarity index 81% rename from cpp/include/raft/stats/mean.cuh rename to cpp/include/raft/stats/detail/mean.cuh index 8691cabc85..1b338a035a 100644 --- a/cpp/include/raft/stats/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,14 @@ #pragma once -#include #include -#include #include +#include + namespace raft { namespace stats { +namespace detail { ///@todo: ColsPerBlk has been tested only for 32! template @@ -62,23 +63,6 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D, } } -/** - * @brief Compute mean of the input matrix - * - * Mean operation is assumed to be performed on a given column. - * - * @tparam Type: the data type - * @tparam IdxType Integer type used to for addressing - * @param mu: the output mean vector - * @param data: the input matrix - * @param D: number of columns of data - * @param N: number of rows of data - * @param sample: whether to evaluate sample mean or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor: whether the input data is row or col major - * @param stream: cuda stream - */ template void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) { @@ -102,5 +86,6 @@ void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, CUDA_CHECK(cudaPeekAtLastError()); } -}; // namespace stats -}; // namespace raft +} // namespace detail +} // namespace stats +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh similarity index 87% rename from cpp/include/raft/stats/stddev.cuh rename to cpp/include/raft/stats/detail/stddev.cuh index f12c633829..e8917a60b3 100644 --- a/cpp/include/raft/stats/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,14 @@ #pragma once -#include #include -#include #include +#include + namespace raft { namespace stats { +namespace detail { ///@todo: ColPerBlk has been tested only for 32! template @@ -131,23 +132,23 @@ void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, } /** - * @brief Compute variance of the input matrix - * - * Variance operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param var the output stddev vector - * @param data the input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param sample whether to evaluate sample stddev or not. 
In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ + * @brief Compute variance of the input matrix + * + * Variance operation is assumed to be performed on a given column. + * + * @tparam Type the data type + * @tparam IdxType Integer type used for addressing + * @param var the output variance vector + * @param data the input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample variance or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor whether the input data is row or col major + * @param stream cuda stream where to launch work + */ template void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) { @@ -172,5 +173,6 @@ void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, CUDA_CHECK(cudaPeekAtLastError()); } -}; // namespace stats -}; // namespace raft +} // namespace detail +} // namespace stats +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh similarity index 83% rename from cpp/include/raft/stats/sum.cuh rename to cpp/include/raft/stats/detail/sum.cuh index 5f8416c7e2..37a3313ed1 100644 --- a/cpp/include/raft/stats/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,13 +16,14 @@ #pragma once -#include -#include #include #include +#include + namespace raft { namespace stats { +namespace detail { ///@todo: ColsPerBlk has been tested only for 32! template @@ -62,20 +63,6 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D, } } -/** - * @brief Compute sum of the input matrix - * - * Sum operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param output the output mean vector - * @param input the input matrix - * @param D number of columns of data - * @param N number of rows of data - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ template void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) { @@ -96,5 +83,6 @@ void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, CUDA_CHECK(cudaPeekAtLastError()); } -}; // end namespace stats -}; // end namespace raft +} // namespace detail +} // namespace stats +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/stats/mean.hpp b/cpp/include/raft/stats/mean.hpp new file mode 100644 index 0000000000..6e4cf39850 --- /dev/null +++ b/cpp/include/raft/stats/mean.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/mean.cuh" + +#include + +namespace raft { +namespace stats { + +/** + * @brief Compute mean of the input matrix + * + * Mean operation is assumed to be performed on a given column. + * + * @tparam Type: the data type + * @tparam IdxType Integer type used for addressing + * @param mu: the output mean vector + * @param data: the input matrix + * @param D: number of columns of data + * @param N: number of rows of data + * @param sample: whether to evaluate sample mean or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor: whether the input data is row or col major + * @param stream: cuda stream + */ +template +void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample, + bool rowMajor, cudaStream_t stream) { + detail::mean(mu, data, D, N, sample, rowMajor, stream); +} + +}; // namespace stats +}; // namespace raft diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.hpp similarity index 100% rename from cpp/include/raft/stats/mean_center.cuh rename to cpp/include/raft/stats/mean_center.hpp diff --git a/cpp/include/raft/stats/stddev.hpp b/cpp/include/raft/stats/stddev.hpp new file mode 100644 index 0000000000..17c5ae457d --- /dev/null +++ b/cpp/include/raft/stats/stddev.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/stddev.cuh" + +#include + +namespace raft { +namespace stats { + +/** + * @brief Compute stddev of the input matrix + * + * Stddev operation is assumed to be performed on a given column. + * + * @tparam Type the data type + * @tparam IdxType Integer type used for addressing + * @param std the output stddev vector + * @param data the input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample stddev or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor whether the input data is row or col major + * @param stream cuda stream where to launch work + */ +template +void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N, + bool sample, bool rowMajor, cudaStream_t stream) { + detail::stddev(std, data, mu, D, N, sample, rowMajor, stream); +} + +/** + * @brief Compute variance of the input matrix + * + * Variance operation is assumed to be performed on a given column.
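The new mean.hpp above is representative of the pattern this PR applies across raft/stats: a thin public wrapper that forwards to the detail:: implementation. A hypothetical call site, using only the signature shown in the diff (column-major input; buffer allocation is assumed to happen elsewhere):

  #include <raft/stats/mean.hpp>

  // Column means of an n_rows x n_cols column-major device matrix.
  void column_means(const float *d_data, int n_rows, int n_cols, float *d_mu,
                    cudaStream_t stream) {
    // D = n_cols, N = n_rows; sample = false normalizes by N, and
    // rowMajor = false because the data is column-major.
    raft::stats::mean(d_mu, d_data, n_cols, n_rows, false, false, stream);
  }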
+ * + * @tparam Type the data type + * @tparam IdxType Integer type used for addressing + * @param var the output variance vector + * @param data the input matrix + * @param mu the mean vector + * @param D number of columns of data + * @param N number of rows of data + * @param sample whether to evaluate sample variance or not. In other words, + * whether + * to normalize the output using N-1 or N, for true or false, respectively + * @param rowMajor whether the input data is row or col major + * @param stream cuda stream where to launch work + */ +template +void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N, + bool sample, bool rowMajor, cudaStream_t stream) { + detail::vars(var, data, mu, D, N, sample, rowMajor, stream); +} + +}; // namespace stats +}; // namespace raft diff --git a/cpp/include/raft/stats/sum.hpp b/cpp/include/raft/stats/sum.hpp new file mode 100644 index 0000000000..4f67acdf36 --- /dev/null +++ b/cpp/include/raft/stats/sum.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "detail/sum.cuh" + +#include + +namespace raft { +namespace stats { + +/** + * @brief Compute sum of the input matrix + * + * Sum operation is assumed to be performed on a given column.
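Continuing that sketch for the stddev/vars wrappers: they compose with raft::stats::mean, taking the per-column means as input. Again hypothetical, derived only from the signatures shown above:

  #include <raft/stats/stddev.hpp>

  // Sample standard deviation per column, given means from raft::stats::mean.
  void column_stddevs(const float *d_data, const float *d_mu, int n_rows,
                      int n_cols, float *d_std, cudaStream_t stream) {
    // sample = true normalizes by N - 1; rowMajor = false for column-major data.
    raft::stats::stddev(d_std, d_data, d_mu, n_cols, n_rows, true, false,
                        stream);
  }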
+ * + * @tparam Type the data type + * @tparam IdxType Integer type used for addressing + * @param output the output sum vector + * @param input the input matrix + * @param D number of columns of data + * @param N number of rows of data + * @param rowMajor whether the input data is row or col major + * @param stream cuda stream where to launch work + */ +template +void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor, + cudaStream_t stream) { + detail::sum(output, input, D, N, rowMajor, stream); +} + +}; // end namespace stats +}; // end namespace raft diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu index 82c2bfdfec..efa1e2cd41 100644 --- a/cpp/test/distance/dist_adj.cu +++ b/cpp/test/distance/dist_adj.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include "../test_utils.h" diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh index f6b18198d2..f31fbc9165 100644 --- a/cpp/test/distance/distance_base.cuh +++ b/cpp/test/distance/distance_base.cuh @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 8c06abf370..33782baf8d 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu index eea9df046b..48ad83dfd2 100644 --- a/cpp/test/linalg/add.cu +++ b/cpp/test/linalg/add.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "add.cuh" diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu index b60f04cd34..c8121bfbe4 100644 --- a/cpp/test/linalg/binary_op.cu +++ b/cpp/test/linalg/binary_op.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include "../test_utils.h" #include "binary_op.cuh" diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu index f17a0f0f5d..fdfc3052b7 100644 --- a/cpp/test/linalg/coalesced_reduction.cu +++ b/cpp/test/linalg/coalesced_reduction.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "reduce.cuh" diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu index 430c35f41b..d90955147c 100644 --- a/cpp/test/linalg/divide.cu +++ b/cpp/test/linalg/divide.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "unary_op.cuh" diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 87d6c4faa6..2ac9118506 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu index 3c75654733..9eb1c10313 100644 --- a/cpp/test/linalg/eig_sel.cu +++ b/cpp/test/linalg/eig_sel.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu index 65bd7e4427..c3b26f5423 100644 --- a/cpp/test/linalg/eltwise.cu +++ b/cpp/test/linalg/eltwise.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/gemm_layout.cu
b/cpp/test/linalg/gemm_layout.cu index cecfc5eb8e..699d40d55e 100644 --- a/cpp/test/linalg/gemm_layout.cu +++ b/cpp/test/linalg/gemm_layout.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu index 4a474bc461..92e59ae49b 100644 --- a/cpp/test/linalg/gemv.cu +++ b/cpp/test/linalg/gemv.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 271ae13b2e..f04c225aa9 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu index e77809def7..9d59e49e60 100644 --- a/cpp/test/linalg/map_then_reduce.cu +++ b/cpp/test/linalg/map_then_reduce.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include "../test_utils.h" diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 28296ad7bd..aad1d1e137 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include "../test_utils.h" #include "matrix_vector_op.cuh" diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu index d0772e538d..f78ae64f05 100644 --- a/cpp/test/linalg/multiply.cu +++ b/cpp/test/linalg/multiply.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "unary_op.cuh" diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu index 94b703d15e..659956534e 100644 --- a/cpp/test/linalg/norm.cu +++ b/cpp/test/linalg/norm.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index cf7585dc23..9822ca2c60 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "reduce.cuh" diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu index 56632a59cc..4f761d39f6 100644 --- a/cpp/test/linalg/strided_reduction.cu +++ b/cpp/test/linalg/strided_reduction.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "reduce.cuh" diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu index df3686ee32..0a82da61c9 100644 --- a/cpp/test/linalg/subtract.cu +++ b/cpp/test/linalg/subtract.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu index cbd6df9c8f..8ebbf19683 100644 --- a/cpp/test/linalg/svd.cu +++ b/cpp/test/linalg/svd.cu @@ -18,8 +18,8 @@ #include #include #include -#include -#include +#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu index b3f7f5b126..1d8ef08673 100644 --- a/cpp/test/linalg/transpose.cu +++ b/cpp/test/linalg/transpose.cu @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu index c3d10d70e7..0fcf465150 100644 --- 
a/cpp/test/linalg/unary_op.cu +++ b/cpp/test/linalg/unary_op.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include "../test_utils.h" #include "unary_op.cuh" diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index 84aa310076..7c7f29815b 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -16,8 +16,8 @@ #include #include -#include -#include +#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu index 9dde1cca9a..e247abad1e 100644 --- a/cpp/test/matrix/matrix.cu +++ b/cpp/test/matrix/matrix.cu @@ -16,8 +16,8 @@ #include #include -#include -#include +#include +#include #include #include "../test_utils.h" diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu index f0e0f6cb93..810d6cb871 100644 --- a/cpp/test/random/rng.cu +++ b/cpp/test/random/rng.cu @@ -18,14 +18,16 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include "../test_utils.h" namespace raft { namespace random { +using namespace raft::random::detail; + enum RandomType { RNG_Normal, RNG_LogNormal, diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu index e51700fbb7..cef2d47276 100644 --- a/cpp/test/random/rng_int.cu +++ b/cpp/test/random/rng_int.cu @@ -18,12 +18,14 @@ #include #include #include -#include +#include #include "../test_utils.h" namespace raft { namespace random { +using namespace raft::random::detail; + enum RandomType { RNG_Uniform }; template diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu index ecb4164616..1d33f08c62 100644 --- a/cpp/test/random/sample_without_replacement.cu +++ b/cpp/test/random/sample_without_replacement.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include "../test_utils.h" @@ -25,6 +25,8 @@ namespace raft { namespace random { +using namespace raft::random::detail; + // Terminology: // SWoR - Sample Without Replacement diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu index b9d4d18e98..a5f08489f1 100644 --- a/cpp/test/sparse/add.cu +++ b/cpp/test/sparse/add.cu @@ -20,7 +20,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu index 8bdd5b88c7..d30114bbcb 100644 --- a/cpp/test/sparse/convert_coo.cu +++ b/cpp/test/sparse/convert_coo.cu @@ -20,7 +20,7 @@ #include #include -#include +#include #include "../test_utils.h" diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu index 2f1ed99332..cd665934c2 100644 --- a/cpp/test/sparse/convert_csr.cu +++ b/cpp/test/sparse/convert_csr.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu index 2201702b03..fbadadb29d 100644 --- a/cpp/test/sparse/degree.cu +++ b/cpp/test/sparse/degree.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu index 4634e5fc0e..58ad9cf803 100644 --- a/cpp/test/sparse/filter.cu +++ b/cpp/test/sparse/filter.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu index 712f95018c..c2a1c4b93c 100644 --- a/cpp/test/sparse/knn_graph.cu +++ b/cpp/test/sparse/knn_graph.cu @@ -16,7 +16,7 @@ 
#include #include -#include +#include #include #include #include "../test_utils.h" diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu index 91b9bc6c54..4900b3ff2b 100644 --- a/cpp/test/sparse/norm.cu +++ b/cpp/test/sparse/norm.cu @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include "../test_utils.h" diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu index 8011d73a6e..d527e7323e 100644 --- a/cpp/test/sparse/row_op.cu +++ b/cpp/test/sparse/row_op.cu @@ -20,7 +20,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu index 92833630dd..7d43780cfd 100644 --- a/cpp/test/sparse/sort.cu +++ b/cpp/test/sparse/sort.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include "../test_utils.h" #include diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu index d50211f971..77d9d3d822 100644 --- a/cpp/test/sparse/symmetrize.cu +++ b/cpp/test/sparse/symmetrize.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 9884202cc0..cf866a5663 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -19,8 +19,8 @@ #include #include #include -#include -#include +#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu index 9845663df9..dcc4b4e551 100644 --- a/cpp/test/stats/mean_center.cu +++ b/cpp/test/stats/mean_center.cu @@ -16,9 +16,9 @@ #include #include -#include -#include -#include +#include +#include +#include #include "../linalg/matrix_vector_op.cuh" #include "../test_utils.h" diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 8c42b70c07..53f392aaf3 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -16,10 +16,10 @@ #include #include -#include -#include -#include -#include +#include +#include +#include +#include #include "../test_utils.h" namespace raft { diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index f5b341cb0e..ac4d642c8e 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -17,8 +17,8 @@ #include #include #include -#include -#include +#include +#include #include "../test_utils.h" namespace raft {
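Taken together, the test changes above switch consumers from the moved detail headers to the new public .hpp entry points. A compact end-to-end sketch in the same spirit (hypothetical; handle construction and device buffer allocation are assumed to follow RAFT's test utilities):

  #include <raft/handle.hpp>
  #include <raft/stats/mean.hpp>
  #include <raft/stats/stddev.hpp>
  #include <raft/stats/sum.hpp>

  // Per-column summary statistics of an n_rows x n_cols column-major matrix,
  // exercising only the public wrappers introduced in this PR.
  void summarize_columns(const raft::handle_t &handle, const float *d_in,
                         int n_rows, int n_cols, float *d_mu, float *d_std,
                         float *d_sum) {
    cudaStream_t stream = handle.get_stream();
    raft::stats::mean(d_mu, d_in, n_cols, n_rows, false, false, stream);
    raft::stats::stddev(d_std, d_in, d_mu, n_cols, n_rows, true, false, stream);
    raft::stats::sum(d_sum, d_in, n_cols, n_rows, false, stream);
  }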