[cuda, hip] unify csr, dense and ell kernels

ginkgo-project · Aug 21, 2024 · c0e434c · c0e434c
1 parent 83a577c
commit c0e434c
Show file tree

Hide file tree

Showing 19 changed files with 356 additions and 257 deletions.
diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
@@ -23,6 +23,9 @@ set(CUDA_HIP_SOURCES
  factorization/par_ilut_select_kernels.cpp
  factorization/par_ilut_spgeam_kernels.cpp
  factorization/par_ilut_sweep_kernels.cpp
+ matrix/batch_csr_kernels.cpp
+ matrix/batch_dense_kernels.cpp
+ matrix/batch_ell_kernels.cpp
  matrix/coo_kernels.cpp
  matrix/dense_kernels.cpp
  matrix/diagonal_kernels.cpp

diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -35,11 +35,15 @@ namespace gko {
 namespace kernels {
 namespace GKO_DEVICE_NAMESPACE {
 namespace batch_single_kernels {
+namespace {
 
 
 constexpr auto default_block_size = 256;
 
 
+}
+
+
 template <typename ValueType, typename Mapping>
 __device__ __forceinline__ void scale(
  const gko::batch::multi_vector::batch_item<const ValueType>& alpha,

diff --git a/.../matrix/batch_csr_kernel_launcher.hpp.inc → common/cuda_hip/matrix/batch_csr_kernels.cpp b/.../matrix/batch_csr_kernel_launcher.hpp.inc → common/cuda_hip/matrix/batch_csr_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/matrix/batch_csr_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_csr_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_csr {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType, typename IndexType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
  const batch::matrix::Csr<ValueType, IndexType>* mat,
@@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
  if (b->get_common_size()[1] > 1) {
  GKO_NOT_IMPLEMENTED;
  }
- simple_apply_kernel<<<num_blocks, default_block_size, 0,
- exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+ batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
+ 0, exec->get_stream()>>>(
+ mat_ub, b_ub, x_ub);
 }
 
 
@@ -41,9 +70,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
  if (b->get_common_size()[1] > 1) {
  GKO_NOT_IMPLEMENTED;
  }
- advanced_apply_kernel<<<num_blocks, default_block_size, 0,
-  exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
-   beta_ub, x_ub);
+ batch_single_kernels::advanced_apply_kernel<<<
+ num_blocks, default_block_size, 0, exec->get_stream()>>>(
+ alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -59,8 +88,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
  const auto col_scale_vals = col_scale->get_const_data();
  const auto row_scale_vals = row_scale->get_const_data();
  const auto mat_ub = get_batch_struct(input);
- scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
- as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
+ batch_single_kernels::
+ scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+ as_device_type(col_scale_vals), as_device_type(row_scale_vals),
+ mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
@@ -77,10 +108,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
  const auto alpha_ub = get_batch_struct(alpha);
  const auto beta_ub = get_batch_struct(beta);
  const auto mat_ub = get_batch_struct(mat);
- add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
-  exec->get_stream()>>>(alpha_ub, beta_ub,
-   mat_ub);
+ batch_single_kernels::add_scaled_identity_kernel<<<
+ num_blocks, default_block_size, 0, exec->get_stream()>>>(
+ alpha_ub, beta_ub, mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(
  GKO_DECLARE_BATCH_CSR_ADD_SCALED_IDENTITY_KERNEL);
+
+
+} // namespace batch_csr
+} // namespace GKO_DEVICE_NAMESPACE
+} // namespace kernels
+} // namespace gko
diff --git a/...cuda_hip/matrix/batch_csr_kernels.hpp.inc → common/cuda_hip/matrix/batch_csr_kernels.hpp b/...cuda_hip/matrix/batch_csr_kernels.hpp.inc → common/cuda_hip/matrix/batch_csr_kernels.hpp
@@ -2,6 +2,44 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_csr.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/batch_struct.hpp"
+#include "cuda/matrix/batch_struct.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/batch_struct.hip.hpp"
+#include "hip/matrix/batch_struct.hip.hpp"
+#else
+#error "batch struct def missing"
+#endif
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_single_kernels {
+
+
 template <typename ValueType, typename IndexType>
 __device__ __forceinline__ void simple_apply(
  const gko::batch::matrix::csr::batch_item<const ValueType, IndexType>& mat,
@@ -21,23 +59,11 @@ __device__ __forceinline__ void simple_apply(
 }
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
- default_block_size,
- sm_oversubscription) void simple_apply_kernel(const gko::batch::matrix::
- csr::uniform_batch<
- const ValueType,
- IndexType>
- mat,
- const gko::batch::
- multi_vector::
- uniform_batch<
- const ValueType>
- b,
- const gko::batch::
- multi_vector::
- uniform_batch<
- ValueType>
- x)
+__global__ __launch_bounds__(default_block_size) void simple_apply_kernel(
+ const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
+ mat,
+ const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+ const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
  for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
  batch_id += gridDim.x) {
@@ -71,33 +97,13 @@ __device__ __forceinline__ void advanced_apply(
 }
 
 template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(
- default_block_size,
- sm_oversubscription) void advanced_apply_kernel(const gko::batch::
- multi_vector::
- uniform_batch<
- const ValueType>
- alpha,
- const gko::batch::matrix::
- csr::uniform_batch<
- const ValueType,
- IndexType>
- mat,
- const gko::batch::
- multi_vector::
- uniform_batch<
- const ValueType>
- b,
- const gko::batch::
- multi_vector::
- uniform_batch<
- const ValueType>
- beta,
- const gko::batch::
- multi_vector::
- uniform_batch<
- ValueType>
- x)
+__global__ __launch_bounds__(default_block_size) void advanced_apply_kernel(
+ const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+ const gko::batch::matrix::csr::uniform_batch<const ValueType, IndexType>
+ mat,
+ const gko::batch::multi_vector::uniform_batch<const ValueType> b,
+ const gko::batch::multi_vector::uniform_batch<const ValueType> beta,
+ const gko::batch::multi_vector::uniform_batch<ValueType> x)
 {
  for (size_type batch_id = blockIdx.x; batch_id < mat.num_batch_items;
  batch_id += gridDim.x) {
@@ -196,3 +202,9 @@ __global__ void add_scaled_identity_kernel(
  add_scaled_identity(alpha_b.values[0], beta_b.values[0], mat_b);
  }
 }
+
+
+} // namespace batch_single_kernels
+} // namespace GKO_DEVICE_NAMESPACE
+} // namespace kernels
+} // namespace gko
diff --git a/...atrix/batch_dense_kernel_launcher.hpp.inc → ...n/cuda_hip/matrix/batch_dense_kernels.cpp b/...atrix/batch_dense_kernel_launcher.hpp.inc → ...n/cuda_hip/matrix/batch_dense_kernels.cpp
@@ -2,6 +2,34 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/matrix/batch_dense_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+#include <ginkgo/core/matrix/batch_dense.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_struct.hpp"
+#include "core/matrix/batch_dense_kernels.hpp"
+#include "core/matrix/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_dense {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
  const batch::matrix::Dense<ValueType>* mat,
@@ -15,8 +43,9 @@ void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
  if (b->get_common_size()[1] > 1) {
  GKO_NOT_IMPLEMENTED;
  }
- simple_apply_kernel<<<num_blocks, default_block_size, 0,
- exec->get_stream()>>>(mat_ub, b_ub, x_ub);
+ batch_single_kernels::simple_apply_kernel<<<num_blocks, default_block_size,
+ 0, exec->get_stream()>>>(
+ mat_ub, b_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -40,9 +69,9 @@ void advanced_apply(std::shared_ptr<const DefaultExecutor> exec,
  if (b->get_common_size()[1] > 1) {
  GKO_NOT_IMPLEMENTED;
  }
- advanced_apply_kernel<<<num_blocks, default_block_size, 0,
-  exec->get_stream()>>>(alpha_ub, mat_ub, b_ub,
-   beta_ub, x_ub);
+ batch_single_kernels::advanced_apply_kernel<<<
+ num_blocks, default_block_size, 0, exec->get_stream()>>>(
+ alpha_ub, mat_ub, b_ub, beta_ub, x_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -58,8 +87,10 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
  const auto col_scale_vals = col_scale->get_const_data();
  const auto row_scale_vals = row_scale->get_const_data();
  const auto mat_ub = get_batch_struct(input);
- scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
- as_device_type(col_scale_vals), as_device_type(row_scale_vals), mat_ub);
+ batch_single_kernels::
+ scale_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+ as_device_type(col_scale_vals), as_device_type(row_scale_vals),
+ mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_SCALE_KERNEL);
@@ -75,7 +106,8 @@ void scale_add(std::shared_ptr<const DefaultExecutor> exec,
  const auto alpha_ub = get_batch_struct(alpha);
  const auto mat_ub = get_batch_struct(mat);
  const auto in_out_ub = get_batch_struct(in_out);
- scale_add_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+ batch_single_kernels::scale_add_kernel<<<num_blocks, default_block_size, 0,
+ exec->get_stream()>>>(
  alpha_ub, mat_ub, in_out_ub);
 }
 
@@ -92,10 +124,16 @@ void add_scaled_identity(std::shared_ptr<const DefaultExecutor> exec,
  const auto alpha_ub = get_batch_struct(alpha);
  const auto beta_ub = get_batch_struct(beta);
  const auto mat_ub = get_batch_struct(mat);
- add_scaled_identity_kernel<<<num_blocks, default_block_size, 0,
-  exec->get_stream()>>>(alpha_ub, beta_ub,
-   mat_ub);
+ batch_single_kernels::add_scaled_identity_kernel<<<
+ num_blocks, default_block_size, 0, exec->get_stream()>>>(
+ alpha_ub, beta_ub, mat_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
  GKO_DECLARE_BATCH_DENSE_ADD_SCALED_IDENTITY_KERNEL);
+
+
+} // namespace batch_dense
+} // namespace GKO_DEVICE_NAMESPACE
+} // namespace kernels
+} // namespace gko